[sldev] [PATCH] OpenJPEG2000: Tech Test with Intel's TBB

Dzonatas dzonatas at dzonux.net
Thu Nov 8 13:57:17 PST 2007


I've attached a very preliminary patch for a tech test of Intel's TBB 
with no extra optimizations or special tune-ups.

I need feedback on speed comparison between the current version of 
OpenJPEG200 to this one.

On a single processors, you may see about 5% slowdown.

On multi-processors (or HT), you should see it run at least 75%-50% of 
the time the current version normally runs.

Let me know if you need any help to test this version.

If this works well, I'll continue with this version and submit an option 
to build libopenjpeg.so with TBB.

Thank You

-- 
Power to Change the Void
-------------- next part --------------
Index: libopenjpeg/dwt.cpp
===================================================================
--- libopenjpeg/dwt.cpp	(revision 0)
+++ libopenjpeg/dwt.cpp	(revision 0)
@@ -0,0 +1,910 @@
+/*
+ * Copyright (c) 2002-2007, Communications and Remote Sensing Laboratory, Universite catholique de Louvain (UCL), Belgium
+ * Copyright (c) 2002-2007, Professor Benoit Macq
+ * Copyright (c) 2001-2003, David Janssens
+ * Copyright (c) 2002-2003, Yannick Verschueren
+ * Copyright (c) 2003-2007, Francois-Olivier Devaux and Antonin Descampe
+ * Copyright (c) 2005, Herve Drolon, FreeImage Team
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS'
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**********************************************************
+ * Copyright (c) 2007, Linden Research, Inc.
+ * This is a preliminary technology test to integrate
+ * Intel's Threading Building Blocks into OpenJPEG2000
+ * 
+ * Contributor(s) Second Life Identity:
+ * * Dzonatas Sol
+ */
+
+
+#include "tbb/task_scheduler_init.h"
+
+#include "tbb/parallel_for.h"
+#include "tbb/blocked_range.h"
+#include "tbb/concurrent_queue.h"
+
+extern "C" {
+#include "opj_includes.h"
+}
+
+
+namespace OPJ
+	{
+	namespace DWT
+        	{
+                class Wavelet
+                	{
+                	public:
+                	int *		mem ;
+                	int		dn ;
+                	int		sn ;
+                	int		cas ;
+                	
+                	void		Encode_5_3();
+                	void		Decode_5_3();
+                	void		Encode_9_7();
+                	void		Decode_9_7();
+                	void		Decode_sm(int k, int n, int x);
+                	void		Decode_dm(int k, int n, int x);
+                	void		Decode_sp(int k, int n, int x);
+                	void		Decode_dp(int k, int n, int x);
+                	void		Interleave(int*,int);
+                	void		Interleave(int*);
+                	void		Deinterleave(int*,int);
+                	void		Deinterleave(int*);
+                	}; 
+                
+                class Tile
+                	{
+                	public:
+                	Wavelet		h ;
+                	Wavelet		v ;
+
+                	void		Decode(opj_tcd_tilecomp_t * tilec, int stop, int qmfbid);
+                	};
+
+                class TileBlocks
+                	{
+                	public:
+                	tbb::concurrent_queue<int*> q;
+                	Wavelet		h ;
+                	Wavelet		v ;
+                	int		wavelet_size;
+                	int *		a ;
+                	int		w ;
+                	int		qmfbid ;
+                	int		rh ;
+                	int		rw ;
+
+                	struct Decode_h
+                		{
+                		TileBlocks *tb ;
+                		void operator()( const tbb::blocked_range<int>& range ) const;
+                		};
+                	struct Decode_v
+                    		{
+                    		TileBlocks *tb ;
+                    		void operator()( const tbb::blocked_range<int>& range ) const;
+                    		};
+
+                	void		Decode(opj_tcd_tilecomp_t * tilec, int stop, int qmfbid);
+                	int *		NewBuffer(void);
+                	};
+        	}
+	}
+
+extern "C" {
+/** @defgroup DWT DWT - Implementation of a discrete wavelet transform */
+/*@{*/
+
+
+/** @name Local static functions */
+/*@{*/
+
+/**
+Forward lazy transform (horizontal)
+*/
+static void dwt_deinterleave_h(int *a, int *b, int dn, int sn, int cas);
+/**
+Forward lazy transform (vertical)
+*/
+static void dwt_deinterleave_v(int *a, int *b, int dn, int sn, int x, int cas);
+/**
+Inverse lazy transform (horizontal)
+*/
+static void dwt_encode_1(int *a, int dn, int sn, int cas);
+/**
+Inverse 5-3 wavelet tranform in 1-D
+*/
+static void dwt_encode_1_real(int *a, int dn, int sn, int cas);
+/**
+Inverse 9-7 wavelet transform in 1-D
+*/
+static void dwt_encode_stepsize(int stepsize, int numbps, opj_stepsize_t *bandno_stepsize);
+
+/*@}*/
+
+/*@}*/
+
+#define S(i) a[(i)*2]
+#define D(i) a[(1+(i)*2)]
+#define S_(i) ((i)<0?S(0):((i)>=sn?S(sn-1):S(i)))
+#define D_(i) ((i)<0?D(0):((i)>=dn?D(dn-1):D(i)))
+/* new */
+#define SS_(i) ((i)<0?S(0):((i)>=dn?S(dn-1):S(i)))
+#define DD_(i) ((i)<0?D(0):((i)>=sn?D(sn-1):D(i)))
+
+/* <summary>                                                              */
+/* This table contains the norms of the 5-3 wavelets for different bands. */
+/* </summary>                                                             */
+static const double dwt_norms[4][10] = {
+	{1.000, 1.500, 2.750, 5.375, 10.68, 21.34, 42.67, 85.33, 170.7, 341.3},
+	{1.038, 1.592, 2.919, 5.703, 11.33, 22.64, 45.25, 90.48, 180.9},
+	{1.038, 1.592, 2.919, 5.703, 11.33, 22.64, 45.25, 90.48, 180.9},
+	{.7186, .9218, 1.586, 3.043, 6.019, 12.01, 24.00, 47.97, 95.93}
+};
+
+/* <summary>                                                              */
+/* This table contains the norms of the 9-7 wavelets for different bands. */
+/* </summary>                                                             */
+static const double dwt_norms_real[4][10] = {
+	{1.000, 1.965, 4.177, 8.403, 16.90, 33.84, 67.69, 135.3, 270.6, 540.9},
+	{2.022, 3.989, 8.355, 17.04, 34.27, 68.63, 137.3, 274.6, 549.0},
+	{2.022, 3.989, 8.355, 17.04, 34.27, 68.63, 137.3, 274.6, 549.0},
+	{2.080, 3.865, 8.307, 17.18, 34.71, 69.59, 139.3, 278.6, 557.2}
+};
+
+/* 
+==========================================================
+   local functions
+==========================================================
+*/
+
+/* <summary>			                 */
+/* Forward lazy transform (horizontal).  */
+/* </summary>                            */ 
+static void dwt_deinterleave_h(int *a, int *b, int dn, int sn, int cas) {
+	int i;
+    for (i=0; i<sn; i++) b[i]=a[2*i+cas];
+    for (i=0; i<dn; i++) b[sn+i]=a[(2*i+1-cas)];
+}
+
+/* <summary>                             */  
+/* Forward lazy transform (vertical).    */
+/* </summary>                            */ 
+static void dwt_deinterleave_v(int *a, int *b, int dn, int sn, int x, int cas) {
+    int i;
+    for (i=0; i<sn; i++) b[i*x]=a[2*i+cas];
+    for (i=0; i<dn; i++) b[(sn+i)*x]=a[(2*i+1-cas)];
+}
+
+
+
+/* <summary>                            */
+/* Forward 5-3 wavelet tranform in 1-D. */
+/* </summary>                           */
+static void dwt_encode_1(int *a, int dn, int sn, int cas) {
+	int i;
+	
+	if (!cas) {
+		if ((dn > 0) || (sn > 1)) {	/* NEW :  CASE ONE ELEMENT */
+			for (i = 0; i < dn; i++) D(i) -= (S_(i) + S_(i + 1)) >> 1;
+			for (i = 0; i < sn; i++) S(i) += (D_(i - 1) + D_(i) + 2) >> 2;
+		}
+	} else {
+		if (!sn && dn == 1)		    /* NEW :  CASE ONE ELEMENT */
+			S(0) *= 2;
+		else {
+			for (i = 0; i < dn; i++) S(i) -= (DD_(i) + DD_(i - 1)) >> 1;
+			for (i = 0; i < sn; i++) D(i) += (SS_(i) + SS_(i + 1) + 2) >> 2;
+		}
+	}
+}
+
+/* <summary>                            */
+/* Inverse 5-3 wavelet tranform in 1-D. */
+/* </summary>                           */ 
+static void dwt_decode_1_(int *a, int dn, int sn, int cas) {
+	int i;
+	
+	if (!cas) {
+		if ((dn > 0) || (sn > 1)) { /* NEW :  CASE ONE ELEMENT */
+			for (i = 0; i < sn; i++) S(i) -= (D_(i - 1) + D_(i) + 2) >> 2;
+			for (i = 0; i < dn; i++) D(i) += (S_(i) + S_(i + 1)) >> 1;
+		}
+	} else {
+		if (!sn  && dn == 1)          /* NEW :  CASE ONE ELEMENT */
+			S(0) /= 2;
+		else {
+			for (i = 0; i < sn; i++) D(i) -= (SS_(i) + SS_(i + 1) + 2) >> 2;
+			for (i = 0; i < dn; i++) S(i) += (DD_(i) + DD_(i - 1)) >> 1;
+		}
+	}
+}
+
+
+/* <summary>                             */
+/* Forward 9-7 wavelet transform in 1-D. */
+/* </summary>                            */
+static void dwt_encode_1_real(int *a, int dn, int sn, int cas) {
+	int i;
+	if (!cas) {
+		if ((dn > 0) || (sn > 1)) {	/* NEW :  CASE ONE ELEMENT */
+			for (i = 0; i < dn; i++)
+				D(i) -= fix_mul(S_(i) + S_(i + 1), 12993);
+			for (i = 0; i < sn; i++)
+				S(i) -= fix_mul(D_(i - 1) + D_(i), 434);
+			for (i = 0; i < dn; i++)
+				D(i) += fix_mul(S_(i) + S_(i + 1), 7233);
+			for (i = 0; i < sn; i++)
+				S(i) += fix_mul(D_(i - 1) + D_(i), 3633);
+			for (i = 0; i < dn; i++)
+				D(i) = fix_mul(D(i), 5038);	/*5038 */
+			for (i = 0; i < sn; i++)
+				S(i) = fix_mul(S(i), 6659);	/*6660 */
+		}
+	} else {
+		if ((sn > 0) || (dn > 1)) {	/* NEW :  CASE ONE ELEMENT */
+			for (i = 0; i < dn; i++)
+				S(i) -= fix_mul(DD_(i) + DD_(i - 1), 12993);
+			for (i = 0; i < sn; i++)
+				D(i) -= fix_mul(SS_(i) + SS_(i + 1), 434);
+			for (i = 0; i < dn; i++)
+				S(i) += fix_mul(DD_(i) + DD_(i - 1), 7233);
+			for (i = 0; i < sn; i++)
+				D(i) += fix_mul(SS_(i) + SS_(i + 1), 3633);
+			for (i = 0; i < dn; i++)
+				S(i) = fix_mul(S(i), 5038);	/*5038 */
+			for (i = 0; i < sn; i++)
+				D(i) = fix_mul(D(i), 6659);	/*6660 */
+		}
+	}
+}
+
+
+
+
+static void dwt_encode_stepsize(int stepsize, int numbps, opj_stepsize_t *bandno_stepsize) {
+	int p, n;
+	p = int_floorlog2(stepsize) - 13;
+	n = 11 - int_floorlog2(stepsize);
+	bandno_stepsize->mant = (n < 0 ? stepsize >> -n : stepsize << n) & 0x7ff;
+	bandno_stepsize->expn = numbps - p;
+}
+
+/* 
+==========================================================
+   DWT interface
+==========================================================
+*/
+
+/* <summary>                            */
+/* Forward 5-3 wavelet tranform in 2-D. */
+/* </summary>                           */
+void dwt_encode(opj_tcd_tilecomp_t * tilec) {
+	int i, j, k;
+	int *a = NULL;
+	int *aj = NULL;
+	int *bj = NULL;
+	int w, l;
+	
+	w = tilec->x1-tilec->x0;
+	l = tilec->numresolutions-1;
+	a = tilec->data;
+	
+	for (i = 0; i < l; i++) {
+		int rw;			/* width of the resolution level computed                                                           */
+		int rh;			/* heigth of the resolution level computed                                                          */
+		int rw1;		/* width of the resolution level once lower than computed one                                       */
+		int rh1;		/* height of the resolution level once lower than computed one                                      */
+		int cas_col;	/* 0 = non inversion on horizontal filtering 1 = inversion between low-pass and high-pass filtering */
+		int cas_row;	/* 0 = non inversion on vertical filtering 1 = inversion between low-pass and high-pass filtering   */
+		int dn, sn;
+		
+		rw = tilec->resolutions[l - i].x1 - tilec->resolutions[l - i].x0;
+		rh = tilec->resolutions[l - i].y1 - tilec->resolutions[l - i].y0;
+		rw1= tilec->resolutions[l - i - 1].x1 - tilec->resolutions[l - i - 1].x0;
+		rh1= tilec->resolutions[l - i - 1].y1 - tilec->resolutions[l - i - 1].y0;
+		
+		cas_row = tilec->resolutions[l - i].x0 % 2;
+		cas_col = tilec->resolutions[l - i].y0 % 2;
+        
+		sn = rh1;
+		dn = rh - rh1;
+		bj = (int*)opj_malloc(rh * sizeof(int));
+		for (j = 0; j < rw; j++) {
+			aj = a + j;
+			for (k = 0; k < rh; k++)  bj[k] = aj[k*w];
+			dwt_encode_1(bj, dn, sn, cas_col);
+			dwt_deinterleave_v(bj, aj, dn, sn, w, cas_col);
+		}
+		opj_free(bj);
+		
+		sn = rw1;
+		dn = rw - rw1;
+		bj = (int*)opj_malloc(rw * sizeof(int));
+		for (j = 0; j < rh; j++) {
+			aj = a + j * w;
+			for (k = 0; k < rw; k++)  bj[k] = aj[k];
+			dwt_encode_1(bj, dn, sn, cas_row);
+			dwt_deinterleave_h(bj, aj, dn, sn, cas_row);
+		}
+		opj_free(bj);
+	}
+}
+
+
+/* <summary>                          */
+/* Get gain of 5-3 wavelet transform. */
+/* </summary>                         */
+int dwt_getgain(int orient) {
+	if (orient == 0)
+		return 0;
+	if (orient == 1 || orient == 2)
+		return 1;
+	return 2;
+}
+
+/* <summary>                */
+/* Get norm of 5-3 wavelet. */
+/* </summary>               */
+double dwt_getnorm(int level, int orient) {
+	return dwt_norms[orient][level];
+}
+
+/* <summary>                             */
+/* Forward 9-7 wavelet transform in 2-D. */
+/* </summary>                            */
+
+void dwt_encode_real(opj_tcd_tilecomp_t * tilec) {
+	int i, j, k;
+	int *a = NULL;
+	int *aj = NULL;
+	int *bj = NULL;
+	int w, l;
+	
+	w = tilec->x1-tilec->x0;
+	l = tilec->numresolutions-1;
+	a = tilec->data;
+	
+	for (i = 0; i < l; i++) {
+		int rw;			/* width of the resolution level computed                                                     */
+		int rh;			/* heigth of the resolution level computed                                                    */
+		int rw1;		/* width of the resolution level once lower than computed one                                 */
+		int rh1;		/* height of the resolution level once lower than computed one                                */
+		int cas_col;	/* 0 = non inversion on horizontal filtering 1 = inversion between low-pass and high-pass filtering */
+		int cas_row;	/* 0 = non inversion on vertical filtering 1 = inversion between low-pass and high-pass filtering   */
+		int dn, sn;
+		
+		rw = tilec->resolutions[l - i].x1 - tilec->resolutions[l - i].x0;
+		rh = tilec->resolutions[l - i].y1 - tilec->resolutions[l - i].y0;
+		rw1= tilec->resolutions[l - i - 1].x1 - tilec->resolutions[l - i - 1].x0;
+		rh1= tilec->resolutions[l - i - 1].y1 - tilec->resolutions[l - i - 1].y0;
+		
+		cas_row = tilec->resolutions[l - i].x0 % 2;
+		cas_col = tilec->resolutions[l - i].y0 % 2;
+		
+		sn = rh1;
+		dn = rh - rh1;
+		bj = (int*)opj_malloc(rh * sizeof(int));
+		for (j = 0; j < rw; j++) {
+			aj = a + j;
+			for (k = 0; k < rh; k++)  bj[k] = aj[k*w];
+			dwt_encode_1_real(bj, dn, sn, cas_col);
+			dwt_deinterleave_v(bj, aj, dn, sn, w, cas_col);
+		}
+		opj_free(bj);
+		
+		sn = rw1;
+		dn = rw - rw1;
+		bj = (int*)opj_malloc(rw * sizeof(int));
+		for (j = 0; j < rh; j++) {
+			aj = a + j * w;
+			for (k = 0; k < rw; k++)  bj[k] = aj[k];
+			dwt_encode_1_real(bj, dn, sn, cas_row);
+			dwt_deinterleave_h(bj, aj, dn, sn, cas_row);
+		}
+		opj_free(bj);
+	}
+}
+
+/* <summary>                            */
+/* Inverse 5-3 wavelet tranform in 2-D. */
+/* </summary>                           */
+void dwt_decode(opj_tcd_tilecomp_t * tilec, int stop)
+	{
+//	OPJ::DWT::Tile t;
+	OPJ::DWT::TileBlocks t;
+	t.Decode(tilec, stop, 1);
+	}
+
+
+/* <summary>                             */
+/* Inverse 9-7 wavelet transform in 2-D. */
+/* </summary>                            */
+void dwt_decode_real(opj_tcd_tilecomp_t * tilec, int stop)
+	{
+//	OPJ::DWT::Tile t;
+	OPJ::DWT::TileBlocks t;
+	t.Decode(tilec, stop, 0);
+	}
+
+
+/* <summary>                          */
+/* Get gain of 9-7 wavelet transform. */
+/* </summary>                         */
+int dwt_getgain_real(int orient) {
+	(void)orient;
+	return 0;
+}
+
+/* <summary>                */
+/* Get norm of 9-7 wavelet. */
+/* </summary>               */
+double dwt_getnorm_real(int level, int orient) {
+	return dwt_norms_real[orient][level];
+}
+
+void dwt_calc_explicit_stepsizes(opj_tccp_t * tccp, int prec) {
+	int numbands, bandno;
+	numbands = 3 * tccp->numresolutions - 2;
+	for (bandno = 0; bandno < numbands; bandno++) {
+		double stepsize;
+		int resno, level, orient, gain;
+
+		resno = (bandno == 0) ? 0 : ((bandno - 1) / 3 + 1);
+		orient = (bandno == 0) ? 0 : ((bandno - 1) % 3 + 1);
+		level = tccp->numresolutions - 1 - resno;
+		gain = (tccp->qmfbid == 0) ? 0 : ((orient == 0) ? 0 : (((orient == 1) || (orient == 2)) ? 1 : 2));
+		if (tccp->qntsty == J2K_CCP_QNTSTY_NOQNT) {
+			stepsize = 1.0;
+		} else {
+			double norm = dwt_norms_real[orient][level];
+			stepsize = (1 << (gain)) / norm;
+		}
+		dwt_encode_stepsize((int) floor(stepsize * 8192.0), prec + gain, &tccp->stepsizes[bandno]);
+	}
+}
+
+
+/* <summary>                             */
+/* Determine maximum computed resolution level for inverse wavelet transform */
+/* </summary>                            */
+static int dwt_decode_max_resolution(opj_tcd_resolution_t* r, int i)
+    	{
+	int mr	= 1;
+	int w;
+	while( --i ) {
+		r++;
+		if( mr < ( w = r->x1 - r->x0 ) )
+			mr = w ;
+		if( mr < ( w = r->y1 - r->y0 ) )
+			mr = w ;
+	}
+	return mr ;
+	}
+
+} // extern "C"
+
+
+
+
+//-------------
+
+
+inline void OPJ::DWT::Wavelet::Encode_5_3()
+    {
+    dwt_encode_1(mem, dn, sn, cas);
+    }
+
+inline void OPJ::DWT::Wavelet::Encode_9_7()
+    {
+    dwt_encode_1_real(mem, dn, sn, cas);
+    }
+
+
+inline void OPJ::DWT::Wavelet::Decode_5_3()
+    {
+    dwt_decode_1_(mem, dn, sn, cas);
+    }
+
+
+#define		WS(i)  mem[(i)*2]
+#define		WD(i)  mem[(1+(i)*2)]
+
+
+inline void OPJ::DWT::Wavelet::Decode_sm(int k, int n, int x) {
+	int m = k > n ? n : k;
+	int l = mem[1];			//D(0);
+	int j;
+	int i;
+	for (i = 0; i < m; i++) {
+		j = l;
+		WS(i) -= fix_mul( ( l = WD(i) ) + j , x);
+	}
+	if( i < k ) {
+		l = fix_mul( l + l , x );
+		for (; i < k; i++)
+			WS(i) -= l;
+	}
+}
+
+inline void OPJ::DWT::Wavelet::Decode_sp(int k, int n, int x) {
+	int m = k > n ? n : k;
+	int l = mem[1];			//D(0);
+	int j;
+	int i;
+	for (i = 0; i < m; i++) {
+		j = l;
+		WS(i) += fix_mul( ( l = WD(i) ) + j , x);
+	}
+	if( i < k ) {
+		l = fix_mul( l + l , x );
+		for (; i < k; i++)
+			WS(i) += l;
+	}
+}
+
+inline void OPJ::DWT::Wavelet::Decode_dm(int k, int n, int x) {
+	int m = k >= n ? n-1 : k;
+	int l = mem[0];				//S(0);
+	int i;
+	int j;
+	for (i = 0; i < m; i++) {
+		j = l;
+		WD(i) -=  fix_mul( ( l = WS(i+1) ) + j , x);
+	}
+	if( i < k ) {
+		l = fix_mul( l + l , x );
+		for (; i < k; i++)
+			WD(i) -= l;
+	}
+}
+
+inline void OPJ::DWT::Wavelet::Decode_dp(int k, int n, int x)	{
+	int m = k >= n ? n-1 : k;
+	int l = mem[0];				//S(0);
+	int i;
+	int j;
+	for (i = 0; i < m; i++) {
+		j = l;
+		WD(i) +=  fix_mul( ( l = WS(i+1) ) + j , x);
+	}
+
+	if( i < k ) {
+		l = fix_mul( l + l , x );
+		for (; i < k; i++)
+			WD(i) += l;
+	}
+}
+
+
+inline void OPJ::DWT::Wavelet::Decode_9_7()
+    	{
+	int i;
+	if (!cas)
+	    	{
+		if ((dn > 0) || (sn > 1))
+		    	{
+			for (i = 0; i < sn; i++)
+				WS(i) = fix_mul(WS(i), 10078);
+			for (i = 0; i < dn; i++)
+				WD(i) = fix_mul(WD(i), 13318);
+			Decode_sm(sn, dn, 3633);
+			Decode_dm(dn, sn, 7233);
+			Decode_sp(sn, dn, 434);
+			Decode_dp(dn, sn, 12994);
+			}
+	    	}
+	else	{
+		if ((sn > 0) || (dn > 1))
+		    	{
+			for (i = 0; i < sn; i++)
+				WD(i) = fix_mul(WD(i), 10078);
+			for (i = 0; i < dn; i++)
+				WS(i) = fix_mul(WS(i), 13318);
+			Decode_dm(sn, dn, 3633);
+			Decode_sm(dn, sn, 7233);
+			Decode_dp(sn, dn, 434);
+			Decode_sp(dn, sn, 12994);
+		    	}
+		}
+    	}
+
+
+inline void OPJ::DWT::Wavelet::Deinterleave(int* a)
+    	{
+	int i;
+        for (i=0; i<sn; i++) mem[i]=a[2*i+cas];
+        for (i=0; i<dn; i++) mem[sn+i]=a[(2*i+1-cas)];
+        }
+
+
+inline void OPJ::DWT::Wavelet::Deinterleave(int* a, int x)
+        {
+        int i;
+        for (i=0; i<sn; i++) mem[i*x]=a[2*i+cas];
+        for (i=0; i<dn; i++) mem[(sn+i)*x]=a[(2*i+1-cas)];
+        }
+
+
+inline void OPJ::DWT::Wavelet::Interleave(int* a, int x)
+    	{
+        int *ai = a;
+        int *bi = mem + cas;
+        int  i = sn;
+        while( i-- )
+                {
+                *bi = *ai ;
+                bi += 2 ;
+                ai += x ;
+                }
+        ai = a + (sn * x);
+        bi = mem + 1 - cas;
+        i = dn ;
+        while( i-- )
+                {
+                *bi = *ai;
+                bi += 2;  
+                ai += x;
+                }
+    	}
+
+
+inline void OPJ::DWT::Wavelet::Interleave( int* a )
+	{
+        int *ai = a;
+        int *bi = mem + cas;
+        int  i	= sn;
+        while( i-- )
+            	{
+            	*bi = *(ai++);
+            	bi += 2;
+            	}
+        ai	= a + sn;
+        bi	= mem + 1 - cas;
+        i	= dn ;
+        while( i-- )
+            	{
+            	*bi = *(ai++);
+            	bi += 2;
+        	}
+	}
+
+
+void OPJ::DWT::Tile::Decode(opj_tcd_tilecomp_t * _tilec, int _stop, int _qmfbid)
+	{
+	opj_tcd_resolution_t* tr;
+	int i, j, k;
+	int *a = NULL;
+	int *aj = NULL;
+	int w; //, l;
+	int rw;			/* width of the resolution level computed  */
+	int rh;			/* heigth of the resolution level computed  */
+	
+	if( 1 > ( i = _tilec->numresolutions - _stop ) )
+		return ;
+
+	tr = _tilec->resolutions;
+
+	w = _tilec->x1 - _tilec->x0;
+	a = _tilec->data;
+
+	h.mem = (int*) opj_aligned_malloc(dwt_decode_max_resolution(tr, i) * sizeof(int));
+	v.mem = h.mem;
+
+	rw = tr->x1 - tr->x0;
+	rh = tr->y1 - tr->y0;
+
+	while( --i )
+	    	{
+		tr++;
+		h.sn = rw;
+		v.sn = rh;
+		h.dn = ( rw = tr->x1 - tr->x0 ) - h.sn;
+		v.dn = ( rh = tr->y1 - tr->y0 ) - v.sn;
+				
+		h.cas = tr->x0 % 2;
+		v.cas = tr->y0 % 2;
+
+		if( _qmfbid == 1 )
+		    	{
+        		aj = a;
+        		j = rh;
+        		while( j-- )
+        		    	{
+        		    	h.Interleave(aj);
+        		    	h.Decode_5_3();
+        			k = rw;
+        			while( k-- )
+        				aj[k] = h.mem[k];
+        			aj += w;
+        			}
+        		aj = a;
+        		j = rw;
+        		while( j-- )
+        		    	{
+        			v.Interleave(aj, w);
+        		    	v.Decode_5_3();
+        			k = rh;
+        			while( k-- )
+        				aj[k * w] = v.mem[k];
+        			aj++;
+        		    	}
+		    	}
+		else	{
+        		aj = a;
+        		j = rh;
+        		while( j-- )
+        		    	{
+        			h.Interleave(aj);
+        			h.Decode_9_7();
+        			k = rw;
+        			while( k-- )
+        				aj[k] = h.mem[k];
+        			aj += w;
+        		    	}
+        		aj = a;
+        		j = rw;
+        		while( j-- )
+        		    	{
+        			v.Interleave(aj, w);
+        		    	v.Decode_9_7();
+        			k = rh;
+        			while( k-- )
+        				aj[k * w] = v.mem[k];
+        			aj++;
+        		    	}
+			}
+		}
+	opj_aligned_free(h.mem);
+	}
+
+
+inline int * OPJ::DWT::TileBlocks::NewBuffer()
+	{
+	return (int*)opj_aligned_malloc(wavelet_size);
+	}
+
+
+void OPJ::DWT::TileBlocks::Decode_h::operator()( const tbb::blocked_range<int>& range ) const
+	{
+	Wavelet wavelet = tb->h;
+	int *	aj = NULL;
+	int	j, k;
+	int	w = tb->w ;
+	int	rw = tb->rw;
+	
+//	std::cout << "h" << range.begin() << ":" << range.end() << std::endl ;
+	
+	if( !tb->q.pop_if_present( wavelet.mem ) )
+	    wavelet.mem = tb->NewBuffer();
+	aj = tb->a + (tb->w * range.begin());
+	j = range.end() - range.begin();
+	if( tb->qmfbid )
+        	while( j-- )
+        	    	{
+        	    	wavelet.Interleave(aj);
+        	    	wavelet.Decode_5_3();
+        		k = rw;
+        		while( k-- )
+        			aj[k] = wavelet.mem[k];
+        		aj += w;
+        		}
+	else
+        	while( j-- )
+        	    	{
+        	    	wavelet.Interleave(aj);
+        	    	wavelet.Decode_9_7();
+        		k = rw;
+        		while( k-- )
+        			aj[k] = wavelet.mem[k];
+        		aj += w;
+        		}
+	tb->q.push( wavelet.mem );
+	}
+
+
+void OPJ::DWT::TileBlocks::Decode_v::operator()( const tbb::blocked_range<int>& range ) const
+	{
+	Wavelet wavelet = tb->v;
+	int *	aj = NULL;
+	int	j, k;
+	int	w = tb->w ;
+	int	rh = tb->rh;
+	
+	if( !tb->q.pop_if_present( wavelet.mem ) )
+	    wavelet.mem = tb->NewBuffer();
+	aj = tb->a + range.begin();
+	j = range.end() - range.begin();
+	if( tb->qmfbid )
+        	while( j-- )
+        	    	{
+        		wavelet.Interleave(aj, w);
+        	    	wavelet.Decode_5_3();
+        		k = rh;
+        		while( k-- )
+        			aj[k * w] = wavelet.mem[k];
+        		aj++;
+        	    	}
+	else
+        	while( j-- )
+        	    	{
+        		wavelet.Interleave(aj, w);
+        	    	wavelet.Decode_9_7();
+        		k = rh;
+        		while( k-- )
+        			aj[k * w] = wavelet.mem[k];
+        		aj++;
+        	    	}
+	tb->q.push( wavelet.mem );
+	}
+
+int tinit = 0 ;
+tbb::task_scheduler_init init;
+	
+void OPJ::DWT::TileBlocks::Decode(opj_tcd_tilecomp_t * _tilec, int _stop, int _qmfbid)
+	{
+	opj_tcd_resolution_t* tr;
+	Decode_h decode_h;
+	Decode_v decode_v;
+	int i ;
+	
+	if( 1 > ( i = _tilec->numresolutions - _stop ) )
+		return ;
+
+	tr = _tilec->resolutions;
+
+	w = _tilec->x1 - _tilec->x0;
+	a = _tilec->data;
+	qmfbid = _qmfbid ;
+	wavelet_size = dwt_decode_max_resolution(tr, i) * sizeof(int);
+	decode_h.tb = this;
+	decode_v.tb = this;
+
+	rw = tr->x1 - tr->x0;
+	rh = tr->y1 - tr->y0;
+
+	while( --i )
+	    	{
+		tr++;
+		h.sn = rw;
+		v.sn = rh;
+		h.dn = ( rw = tr->x1 - tr->x0 ) - h.sn;
+		v.dn = ( rh = tr->y1 - tr->y0 ) - v.sn;
+				
+		h.cas = tr->x0 % 2;
+		v.cas = tr->y0 % 2;
+		
+	    	tbb::parallel_for( tbb::blocked_range<int>( 0, rh, 4 ), decode_h);
+	    	tbb::parallel_for( tbb::blocked_range<int>( 0, rw, 4 ), decode_v);
+		}
+	while( !q.empty() )
+        	{
+        	int * m ;
+        	q.pop( m );
+        	opj_aligned_free(m);
+        	}
+	}
Index: codec/Makefile
===================================================================
--- codec/Makefile	(revision 470)
+++ codec/Makefile	(working copy)
@@ -1,6 +1,6 @@
 # Makefile for the main OpenJPEG codecs: j2k_to_image and image_to_j2k
 
-CFLAGS = -O3 -lstdc++ # -g -p -pg
+CFLAGS = -lstdc++ -ltbb -fPIC -msse2 -mfpmath=sse -pg -O2 # -g -p -pg 
 
 all: j2k_to_image image_to_j2k
 
Index: Makefile
===================================================================
--- Makefile	(revision 470)
+++ Makefile	(working copy)
@@ -3,7 +3,7 @@
 VER_MAJOR = 2
 VER_MINOR = 1.2.0
 
-SRCS = ./libopenjpeg/bio.c ./libopenjpeg/cio.c ./libopenjpeg/dwt.c ./libopenjpeg/event.c ./libopenjpeg/image.c ./libopenjpeg/j2k.c ./libopenjpeg/j2k_lib.c ./libopenjpeg/jp2.c ./libopenjpeg/jpt.c ./libopenjpeg/mct.c ./libopenjpeg/mqc.c ./libopenjpeg/openjpeg.c ./libopenjpeg/pi.c ./libopenjpeg/raw.c ./libopenjpeg/t1.c ./libopenjpeg/t2.c ./libopenjpeg/tcd.c ./libopenjpeg/tgt.c
+SRCS = ./libopenjpeg/bio.c ./libopenjpeg/cio.c ./libopenjpeg/dwt.cpp ./libopenjpeg/event.c ./libopenjpeg/image.c ./libopenjpeg/j2k.c ./libopenjpeg/j2k_lib.c ./libopenjpeg/jp2.c ./libopenjpeg/jpt.c ./libopenjpeg/mct.c ./libopenjpeg/mqc.c ./libopenjpeg/openjpeg.c ./libopenjpeg/pi.c ./libopenjpeg/raw.c ./libopenjpeg/t1.c ./libopenjpeg/t2.c ./libopenjpeg/tcd.c ./libopenjpeg/tgt.c
 INCLS = ./libopenjpeg/bio.h ./libopenjpeg/cio.h ./libopenjpeg/dwt.h ./libopenjpeg/event.h ./libopenjpeg/fix.h ./libopenjpeg/image.h ./libopenjpeg/int.h ./libopenjpeg/j2k.h ./libopenjpeg/j2k_lib.h ./libopenjpeg/jp2.h ./libopenjpeg/jpt.h ./libopenjpeg/mct.h ./libopenjpeg/mqc.h ./libopenjpeg/openjpeg.h ./libopenjpeg/pi.h ./libopenjpeg/raw.h ./libopenjpeg/t1.h ./libopenjpeg/t2.h ./libopenjpeg/tcd.h ./libopenjpeg/tgt.h ./libopenjpeg/opj_malloc.h ./libopenjpeg/opj_includes.h 
 INCLUDE = -Ilibopenjpeg
 
@@ -18,10 +18,13 @@
 # Converts cr/lf to just lf
 DOS2UNIX = dos2unix
 
-COMPILERFLAGS = -O3 -fPIC
-LIBRARIES = -lstdc++
+COMPILERFLAGS = -fPIC -msse2 -mfpmath=sse -pg -O2
+LIBRARIES = -lstdc++ -ltbb
 
-MODULES = $(SRCS:.c=.o)
+SRCSC = $(filter %.c,$(SRCS))
+SRCSCPP = $(filter %.cpp,$(SRCS))
+MODULES1 = $(SRCS:.cpp=.o) 
+MODULES = $(MODULES1:.c=.o) 
 CFLAGS = $(COMPILERFLAGS) $(INCLUDE)
 
 TARGET  = openjpeg
@@ -47,9 +50,12 @@
 
 OpenJPEG: $(STATICLIB) $(SHAREDLIB)
 
-.c.o:
+%.o: %.cpp
 	$(CC) $(CFLAGS) -c $< -o $@
 
+$(SRCSC:%.c=%.o): 
+	$(CC) $(CFLAGS) -c $*.c -o $@
+
 $(STATICLIB): $(MODULES)
 	$(AR) r $@ $(MODULES)
 


More information about the SLDev mailing list