[sldev] [patch] OpenJPEG DWT optimizations

Dzonatas dzonatas at dzonux.net
Mon Apr 2 23:40:11 PDT 2007


Hello,

I believe this patch is ready. I have spent much time to test very 
little changes to make sure the the output of this optimized version 
matches the previous version found in the svn trunk on www.openjpeg.org. 
I have not managed to obtain a collection of j2k files to test every 
known case; however, I merged similar functions together to help test 
different formats in a single run.

This version adds optimization, but it does not add GCC4 vector ops. I 
wanted to make sure this patch can easily be compiled by other versions 
and compilers. It still uses the same basic principles for fixed-point 
integer math, but many redundant calculations have been moved around to 
a more optimal place. There are more optimizations that can be done; 
however, I traded those extra optimization steps for readability.

Results:

"ed_05000.jp2" is 652K in compressed form and 6MB uncompressed

decode of "ed_05000.jp2" without attached patch:
[INFO] - dwt took 2.750000 s
real    0m15.784s
user    0m5.550s
sys     0m10.340s

decode of "ed_05000.jp2" with attached patch
[INFO] - dwt took 1.870000 s
real    0m12.304s
user    0m6.570s
sys     0m5.610s

About 3.5 seconds faster with the patch.

"bird.jp2" is 156K in compressed form and 18MB uncompressed

decode of "bird.jp2" without attached patch:
[INFO] - dwt took 11.190000 s
real    0m38.438s
user    0m12.170s
sys     0m26.520s

decode of "bird.jp2" with attached patch
[INFO] - dwt took 10.350000 s
real    0m28.572s
user    0m9.590s
sys     0m18.030s

About 10 seconds faster with the patch.

In both of these cases, the "user" time deltas reflects the amount of 
optimization in time relative to the DWT. The "sys" time deltas reflects 
the amount of relief the optimization causes such as less cache hits, 
page faults, context switches, etc.

The overall performance gain is estimated to about 30%.

I hope you like the attached patch. =)

-- 
-------------- next part --------------
Index: libopenjpeg/dwt.c
===================================================================
--- libopenjpeg/dwt.c	(revision 369)
+++ libopenjpeg/dwt.c	(working copy)
@@ -56,6 +56,24 @@
 /** @defgroup DWT DWT - Implementation of a discrete wavelet transform */
 /*@{*/
 
+/** @name Local data structures */
+/*@{*/
+
+typedef struct
+	{
+	int *	mem ;
+	int		dn ;
+	int		sn ;
+	int		cas ;
+	} dwt_t ; 
+
+/*@}*/
+
+/**
+Virtual function type for wavelet transform in 1-D 
+*/
+typedef void (*DWT1DFN)(dwt_t* v);
+
 /** @name Local static functions */
 /*@{*/
 
@@ -70,11 +88,11 @@
 /**
 Inverse lazy transform (horizontal)
 */
-static void dwt_interleave_h(int *a, int *b, int dn, int sn, int cas);
+static void dwt_interleave_h(dwt_t* h, int *a);
 /**
 Inverse lazy transform (vertical)
 */
-static void dwt_interleave_v(int *a, int *b, int dn, int sn, int x, int cas);
+static void dwt_interleave_v(dwt_t* v, int *a, int x);
 /**
 Forward 5-3 wavelet tranform in 1-D
 */
@@ -82,7 +100,7 @@
 /**
 Inverse 5-3 wavelet tranform in 1-D
 */
-static void dwt_decode_1(int *a, int dn, int sn, int cas);
+static void dwt_decode_1(dwt_t *v);
 /**
 Forward 9-7 wavelet transform in 1-D
 */
@@ -90,7 +108,7 @@
 /**
 Inverse 9-7 wavelet transform in 1-D
 */
-static void dwt_decode_1_real(int *a, int dn, int sn, int cas);
+static void dwt_decode_1_real(dwt_t *v);
 /**
 FIXME : comment ???
 */
@@ -95,6 +113,10 @@
 FIXME : comment ???
 */
 static void dwt_encode_stepsize(int stepsize, int numbps, opj_stepsize_t *bandno_stepsize);
+/**
+Inverse wavelet tranform in 2-D.
+*/
+static void dwt_decode_tile(opj_tcd_tilecomp_t * tilec, int stop , DWT1DFN fn);
 
 /*@}*/
 
@@ -155,23 +177,20 @@
 /* <summary>                             */
 /* Inverse lazy transform (horizontal).  */
 /* </summary>                            */
-static void dwt_interleave_h(int *a, int *b, int dn, int sn, int cas) {
-    int i;
-    int *ai = NULL;
-    int *bi = NULL;
-    ai = a;
-    bi = b + cas;
-    for (i = 0; i < sn; i++) {
-      *bi = *ai;  
-	  bi += 2;  
-	  ai++;
+static void dwt_interleave_h(dwt_t* h, int *a) {
+    int *ai = a;
+    int *bi = h->mem + h->cas;
+    int  i	= h->sn;
+    while( i-- ) {
+      *bi = *(ai++);
+	  bi += 2;
     }
-    ai = a + sn;
-    bi = b + 1 - cas;
-    for (i = 0; i < dn; i++) {
-      *bi = *ai;
+    ai	= a + h->sn;
+    bi	= h->mem + 1 - h->cas;
+    i	= h->dn ;
+    while( i-- ) {
+      *bi = *(ai++);
 	  bi += 2;
-	  ai++;
     }
 }
 
@@ -178,13 +197,11 @@
 /* <summary>                             */  
 /* Inverse lazy transform (vertical).    */
 /* </summary>                            */ 
-static void dwt_interleave_v(int *a, int *b, int dn, int sn, int x, int cas) {
-    int i;
-    int *ai = NULL;
-    int *bi = NULL;
-    ai = a;
-    bi = b + cas;
-    for (i = 0; i < sn; i++) {
+static void dwt_interleave_v(dwt_t* v, int *a, int x) {
+    int *ai = a;
+    int *bi = v->mem + v->cas;
+    int  i = v->sn;
+    while( i-- ) {
       *bi = *ai;
 	  bi += 2;
 	  ai += x;
@@ -189,9 +206,10 @@
 	  bi += 2;
 	  ai += x;
     }
-    ai = a + (sn * x);
-    bi = b + 1 - cas;
-    for (i = 0; i < dn; i++) {
+    ai = a + (v->sn * x);
+    bi = v->mem + 1 - v->cas;
+    i = v->dn ;
+    while( i-- ) {
       *bi = *ai;
 	  bi += 2;  
 	  ai += x;
@@ -223,7 +241,7 @@
 /* <summary>                            */
 /* Inverse 5-3 wavelet tranform in 1-D. */
 /* </summary>                           */ 
-static void dwt_decode_1(int *a, int dn, int sn, int cas) {
+static void dwt_decode_1_(int *a, int dn, int sn, int cas) {
 	int i;
 	
 	if (!cas) {
@@ -241,6 +259,13 @@
 	}
 }
 
+/* <summary>                            */
+/* Inverse 5-3 wavelet tranform in 1-D. */
+/* </summary>                           */ 
+static void dwt_decode_1(dwt_t *v) {
+	dwt_decode_1_(v->mem, v->dn, v->sn, v->cas);
+}
+
 /* <summary>                             */
 /* Forward 9-7 wavelet transform in 1-D. */
 /* </summary>                            */
@@ -279,40 +304,101 @@
 	}
 }
 
+#define WS(i) v->mem[(i)*2]
+#define WD(i) v->mem[(1+(i)*2)]
+
+static void dwt_decode_sm(dwt_t* v, int k, int n, int x) {
+	int m = k > n ? n : k;
+	int l = v->mem[1];			//D(0);
+	int j;
+	int i;
+	for (i = 0; i < m; i++) {
+		j = l;
+		WS(i) -= fix_mul( ( l = WD(i) ) + j , x);
+	}
+	if( i < k ) {
+		l = fix_mul( l + l , x );
+		for (; i < k; i++)
+			WS(i) -= l;
+	}
+}
+
+static void dwt_decode_sp(dwt_t* v, int k, int n, int x) {
+	int m = k > n ? n : k;
+	int l = v->mem[1];			//D(0);
+	int j;
+	int i;
+	for (i = 0; i < m; i++) {
+		j = l;
+		WS(i) += fix_mul( ( l = WD(i) ) + j , x);
+	}
+	if( i < k ) {
+		l = fix_mul( l + l , x );
+		for (; i < k; i++)
+			WS(i) += l;
+	}
+}
+
+static void dwt_decode_dm(dwt_t* v, int k, int n, int x) {
+	int m = k >= n ? n-1 : k;
+	int l = v->mem[0];				//S(0);
+	int i;
+	int j;
+	for (i = 0; i < m; i++) {
+		j = l;
+		WD(i) -=  fix_mul( ( l = WS(i+1) ) + j , x);
+	}
+	if( i < k ) {
+		l = fix_mul( l + l , x );
+		for (; i < k; i++)
+			WD(i) -= l;
+	}
+}
+
+static void dwt_decode_dp(dwt_t* v, int k, int n, int x) {
+	int m = k >= n ? n-1 : k;
+	int l = v->mem[0];				//S(0);
+	int i;
+	int j;
+	for (i = 0; i < m; i++) {
+		j = l;
+		WD(i) +=  fix_mul( ( l = WS(i+1) ) + j , x);
+	}
+
+	if( i < k ) {
+		l = fix_mul( l + l , x );
+		for (; i < k; i++)
+			WD(i) += l;
+	}
+}
+
+
 /* <summary>                             */
 /* Inverse 9-7 wavelet transform in 1-D. */
 /* </summary>                            */
-static void dwt_decode_1_real(int *a, int dn, int sn, int cas) {
+static void dwt_decode_1_real(dwt_t* v) {
 	int i;
-	if (!cas) {
-		if ((dn > 0) || (sn > 1)) {	/* NEW :  CASE ONE ELEMENT */
-			for (i = 0; i < sn; i++)
-				S(i) = fix_mul(S(i), 10078);	/* 10076 */
-			for (i = 0; i < dn; i++)
-				D(i) = fix_mul(D(i), 13318);	/* 13320 */
-			for (i = 0; i < sn; i++)
-				S(i) -= fix_mul(D_(i - 1) + D_(i), 3633);
-			for (i = 0; i < dn; i++)
-				D(i) -= fix_mul(S_(i) + S_(i + 1), 7233);
-			for (i = 0; i < sn; i++)
-				S(i) += fix_mul(D_(i - 1) + D_(i), 434);
-			for (i = 0; i < dn; i++)
-				D(i) += fix_mul(S_(i) + S_(i + 1), 12994);	/* 12993 */
+	if (!v->cas) {
+		if ((v->dn > 0) || (v->sn > 1)) {	/* NEW :  CASE ONE ELEMENT */
+			for (i = 0; i < v->sn; i++)
+				WS(i) = fix_mul(WS(i), 10078);	/* 10076 */
+			for (i = 0; i < v->dn; i++)
+				WD(i) = fix_mul(WD(i), 13318);	/* 13320 */
+			dwt_decode_sm(v, v->sn, v->dn, 3633);
+			dwt_decode_dm(v, v->dn, v->sn, 7233);
+			dwt_decode_sp(v, v->sn, v->dn, 434);
+			dwt_decode_dp(v, v->dn, v->sn, 12994);
 		}
 	} else {
-		if ((sn > 0) || (dn > 1)) {	/* NEW :  CASE ONE ELEMENT */
-			for (i = 0; i < sn; i++)
-				D(i) = fix_mul(D(i), 10078);	/* 10076 */
-			for (i = 0; i < dn; i++)
-				S(i) = fix_mul(S(i), 13318);	/* 13320 */
-			for (i = 0; i < sn; i++)
-				D(i) -= fix_mul(SS_(i) + SS_(i + 1), 3633);
-			for (i = 0; i < dn; i++)
-				S(i) -= fix_mul(DD_(i) + DD_(i - 1), 7233);
-			for (i = 0; i < sn; i++)
-				D(i) += fix_mul(SS_(i) + SS_(i + 1), 434);
-			for (i = 0; i < dn; i++)
-				S(i) += fix_mul(DD_(i) + DD_(i - 1), 12994);	/* 12993 */
+		if ((v->sn > 0) || (v->dn > 1)) {	/* NEW :  CASE ONE ELEMENT */
+			for (i = 0; i < v->sn; i++)
+				WD(i) = fix_mul(WD(i), 10078);	/* 10076 */
+			for (i = 0; i < v->dn; i++)
+				WS(i) = fix_mul(WS(i), 13318);	/* 13320 */
+			dwt_decode_dm(v, v->sn, v->dn, 3633);
+			dwt_decode_sm(v, v->dn, v->sn, 7233);
+			dwt_decode_dp(v, v->sn, v->dn, 434);
+			dwt_decode_sp(v, v->dn, v->sn, 12994);
 		}
 	}
 }
@@ -391,55 +477,7 @@
 /* Inverse 5-3 wavelet tranform in 2-D. */
 /* </summary>                           */
 void dwt_decode(opj_tcd_tilecomp_t * tilec, int stop) {
-	int i, j, k;
-	int *a = NULL;
-	int *aj = NULL;
-	int *bj = NULL;
-	int w, l;
-	
-	w = tilec->x1-tilec->x0;
-	l = tilec->numresolutions-1;
-	a = tilec->data;
-	
-	for (i = l - 1; i >= stop; i--) {
-		int rw;			/* width of the resolution level computed                                                           */
-		int rh;			/* heigth of the resolution level computed                                                          */
-		int rw1;		/* width of the resolution level once lower than computed one                                       */
-		int rh1;		/* height of the resolution level once lower than computed one                                      */
-		int cas_col;	/* 0 = non inversion on horizontal filtering 1 = inversion between low-pass and high-pass filtering */
-		int cas_row;	/* 0 = non inversion on vertical filtering 1 = inversion between low-pass and high-pass filtering   */
-		int dn, sn;
-		
-		rw = tilec->resolutions[l - i].x1 - tilec->resolutions[l - i].x0;
-		rh = tilec->resolutions[l - i].y1 - tilec->resolutions[l - i].y0;
-		rw1= tilec->resolutions[l - i - 1].x1 - tilec->resolutions[l - i - 1].x0;
-		rh1= tilec->resolutions[l - i - 1].y1 - tilec->resolutions[l - i - 1].y0;
-		
-		cas_row = tilec->resolutions[l - i].x0 % 2;
-		cas_col = tilec->resolutions[l - i].y0 % 2;
-		
-		sn = rw1;
-		dn = rw - rw1;
-		bj = (int*)opj_malloc(rw * sizeof(int));
-		for (j = 0; j < rh; j++) {
-			aj = a + j*w;
-			dwt_interleave_h(aj, bj, dn, sn, cas_row);
-			dwt_decode_1(bj, dn, sn, cas_row);
-			for (k = 0; k < rw; k++)  aj[k] = bj[k];
-		}
-		opj_free(bj);
-		
-		sn = rh1;
-		dn = rh - rh1;
-		bj = (int*)opj_malloc(rh * sizeof(int));
-		for (j = 0; j < rw; j++) {
-			aj = a + j;
-			dwt_interleave_v(aj, bj, dn, sn, w, cas_col);
-			dwt_decode_1(bj, dn, sn, cas_col);
-			for (k = 0; k < rh; k++)  aj[k * w] = bj[k];
-		}
-		opj_free(bj);
-	}
+	dwt_decode_tile(tilec, stop, &dwt_decode_1);
 }
 
 
@@ -522,55 +560,7 @@
 /* Inverse 9-7 wavelet transform in 2-D. */
 /* </summary>                            */
 void dwt_decode_real(opj_tcd_tilecomp_t * tilec, int stop) {
-	int i, j, k;
-	int *a = NULL;
-	int *aj = NULL;
-	int *bj = NULL;
-	int w, l;
-	
-	w = tilec->x1-tilec->x0;
-	l = tilec->numresolutions-1;
-	a = tilec->data;
-	
-	for (i = l-1; i >= stop; i--) {
-		int rw;			/* width of the resolution level computed                       */
-		int rh;			/* heigth of the resolution level computed                      */
-		int rw1;		/* width of the resolution level once lower than computed one   */
-		int rh1;		/* height of the resolution level once lower than computed one  */
-		int cas_col;	/* 0 = non inversion on horizontal filtering 1 = inversion between low-pass and high-pass filtering */
-		int cas_row;	/* 0 = non inversion on vertical filtering 1 = inversion between low-pass and high-pass filtering   */
-		int dn, sn;
-		
-		rw = tilec->resolutions[l - i].x1 - tilec->resolutions[l - i].x0;
-		rh = tilec->resolutions[l - i].y1 - tilec->resolutions[l - i].y0;
-		rw1= tilec->resolutions[l - i - 1].x1 - tilec->resolutions[l - i - 1].x0;
-		rh1= tilec->resolutions[l - i - 1].y1 - tilec->resolutions[l - i - 1].y0;
-		
-		cas_col = tilec->resolutions[l - i].x0 % 2; /* 0 = non inversion on horizontal filtering 1 = inversion between low-pass and high-pass filtering */
-		cas_row = tilec->resolutions[l - i].y0 % 2; /* 0 = non inversion on vertical filtering 1 = inversion between low-pass and high-pass filtering   */
-        
-		sn = rw1;
-		dn = rw-rw1;
-		bj = (int*)opj_malloc(rw * sizeof(int));
-		for (j = 0; j < rh; j++) {
-			aj = a + j * w;
-			dwt_interleave_h(aj, bj, dn, sn, cas_col);
-			dwt_decode_1_real(bj, dn, sn, cas_col);
-			for (k = 0; k < rw; k++)  aj[k] = bj[k];
-		}
-		opj_free(bj);
-		
-		sn = rh1;
-		dn = rh-rh1;
-		bj = (int*)opj_malloc(rh * sizeof(int));
-		for (j = 0; j < rw; j++) {
-			aj = a + j;
-			dwt_interleave_v(aj, bj, dn, sn, w, cas_row);
-			dwt_decode_1_real(bj, dn, sn, cas_row);
-			for (k = 0; k < rh; k++)  aj[k * w] = bj[k];
-		}
-		opj_free(bj);
-	}
+	dwt_decode_tile(tilec, stop, dwt_decode_1_real);
 }
 
 
@@ -609,3 +599,85 @@
 		dwt_encode_stepsize((int) floor(stepsize * 8192.0), prec + gain, &tccp->stepsizes[bandno]);
 	}
 }
+
+
+/* <summary>                             */
+/* Determine maximum computed resolution level for inverse wavelet transform */
+/* </summary>                            */
+static int dwt_decode_max_resolution(opj_tcd_resolution_t* r, int i) {
+	int mr	= 1;
+	int w;
+	while( --i ) {
+		r++;
+		if( mr < ( w = r->x1 - r->x0 ) )
+			mr = w ;
+		if( mr < ( w = r->y1 - r->y0 ) )
+			mr = w ;
+	}
+	return mr ;
+}
+
+
+/* <summary>                            */
+/* Inverse wavelet tranform in 2-D.     */
+/* </summary>                           */
+static void dwt_decode_tile(opj_tcd_tilecomp_t * tilec, int stop, DWT1DFN dwt_1D) {
+	opj_tcd_resolution_t* tr;
+	int i, j, k;
+	int *a = NULL;
+	int *aj = NULL;
+	int *m;
+	int w; //, l;
+	int rw;			/* width of the resolution level computed  */
+	int rh;			/* heigth of the resolution level computed  */
+	dwt_t h;
+	dwt_t v;
+	
+	if( 1 > ( i = tilec->numresolutions - stop ) )
+		return ;
+
+	tr = tilec->resolutions;
+
+	w = tilec->x1-tilec->x0;
+	a = tilec->data;
+
+	m = (int*)opj_malloc(sizeof(int) * (dwt_decode_max_resolution(tr, i)+5));
+	h.mem = v.mem = (int*)( (unsigned)m + 16 - ( (unsigned)m % 16 ) ) ;
+
+	rw = tr->x1 - tr->x0;
+	rh = tr->y1 - tr->y0;
+
+	while( --i ) {
+		tr++;
+		h.sn = rw;
+		v.sn = rh;
+		h.dn = ( rw = tr->x1 - tr->x0 ) - h.sn;
+		v.dn = ( rh = tr->y1 - tr->y0 ) - v.sn;
+				
+		h.cas = tr->x0 % 2;
+		v.cas = tr->y0 % 2;
+
+		aj = a;
+		j = rh;
+		while( j-- ) {		
+			dwt_interleave_h(&h, aj);
+			(dwt_1D)(&h);
+			k = rw;
+			while( k-- )
+				aj[k] = h.mem[k];
+			aj += w;
+		}
+
+		aj = a;
+		j = rw;
+		while( j-- ) {
+			dwt_interleave_v(&v, aj, w);
+			(dwt_1D)(&v);
+			k = rh;
+			while( k-- )
+				aj[k * w] = v.mem[k];
+			aj++;
+		}
+	}
+	opj_free(m);
+}


More information about the SLDev mailing list