[sldev] [patch] OpenJPEG DWT optimizations
Dzonatas
dzonatas at dzonux.net
Mon Apr 2 23:40:11 PDT 2007
Hello,
I believe this patch is ready. I have spent much time to test very
little changes to make sure the the output of this optimized version
matches the previous version found in the svn trunk on www.openjpeg.org.
I have not managed to obtain a collection of j2k files to test every
known case; however, I merged similar functions together to help test
different formats in a single run.
This version adds optimization, but it does not add GCC4 vector ops. I
wanted to make sure this patch can easily be compiled by other versions
and compilers. It still uses the same basic principles for fixed-point
integer math, but many redundant calculations have been moved around to
a more optimal place. There are more optimizations that can be done;
however, I traded those extra optimization steps for readability.
Results:
"ed_05000.jp2" is 652K in compressed form and 6MB uncompressed
decode of "ed_05000.jp2" without attached patch:
[INFO] - dwt took 2.750000 s
real 0m15.784s
user 0m5.550s
sys 0m10.340s
decode of "ed_05000.jp2" with attached patch
[INFO] - dwt took 1.870000 s
real 0m12.304s
user 0m6.570s
sys 0m5.610s
About 3.5 seconds faster with the patch.
"bird.jp2" is 156K in compressed form and 18MB uncompressed
decode of "bird.jp2" without attached patch:
[INFO] - dwt took 11.190000 s
real 0m38.438s
user 0m12.170s
sys 0m26.520s
decode of "bird.jp2" with attached patch
[INFO] - dwt took 10.350000 s
real 0m28.572s
user 0m9.590s
sys 0m18.030s
About 10 seconds faster with the patch.
In both of these cases, the "user" time deltas reflects the amount of
optimization in time relative to the DWT. The "sys" time deltas reflects
the amount of relief the optimization causes such as less cache hits,
page faults, context switches, etc.
The overall performance gain is estimated to about 30%.
I hope you like the attached patch. =)
--
-------------- next part --------------
Index: libopenjpeg/dwt.c
===================================================================
--- libopenjpeg/dwt.c (revision 369)
+++ libopenjpeg/dwt.c (working copy)
@@ -56,6 +56,24 @@
/** @defgroup DWT DWT - Implementation of a discrete wavelet transform */
/*@{*/
+/** @name Local data structures */
+/*@{*/
+
+typedef struct
+ {
+ int * mem ;
+ int dn ;
+ int sn ;
+ int cas ;
+ } dwt_t ;
+
+/*@}*/
+
+/**
+Virtual function type for wavelet transform in 1-D
+*/
+typedef void (*DWT1DFN)(dwt_t* v);
+
/** @name Local static functions */
/*@{*/
@@ -70,11 +88,11 @@
/**
Inverse lazy transform (horizontal)
*/
-static void dwt_interleave_h(int *a, int *b, int dn, int sn, int cas);
+static void dwt_interleave_h(dwt_t* h, int *a);
/**
Inverse lazy transform (vertical)
*/
-static void dwt_interleave_v(int *a, int *b, int dn, int sn, int x, int cas);
+static void dwt_interleave_v(dwt_t* v, int *a, int x);
/**
Forward 5-3 wavelet tranform in 1-D
*/
@@ -82,7 +100,7 @@
/**
Inverse 5-3 wavelet tranform in 1-D
*/
-static void dwt_decode_1(int *a, int dn, int sn, int cas);
+static void dwt_decode_1(dwt_t *v);
/**
Forward 9-7 wavelet transform in 1-D
*/
@@ -90,7 +108,7 @@
/**
Inverse 9-7 wavelet transform in 1-D
*/
-static void dwt_decode_1_real(int *a, int dn, int sn, int cas);
+static void dwt_decode_1_real(dwt_t *v);
/**
FIXME : comment ???
*/
@@ -95,6 +113,10 @@
FIXME : comment ???
*/
static void dwt_encode_stepsize(int stepsize, int numbps, opj_stepsize_t *bandno_stepsize);
+/**
+Inverse wavelet tranform in 2-D.
+*/
+static void dwt_decode_tile(opj_tcd_tilecomp_t * tilec, int stop , DWT1DFN fn);
/*@}*/
@@ -155,23 +177,20 @@
/* <summary> */
/* Inverse lazy transform (horizontal). */
/* </summary> */
-static void dwt_interleave_h(int *a, int *b, int dn, int sn, int cas) {
- int i;
- int *ai = NULL;
- int *bi = NULL;
- ai = a;
- bi = b + cas;
- for (i = 0; i < sn; i++) {
- *bi = *ai;
- bi += 2;
- ai++;
+static void dwt_interleave_h(dwt_t* h, int *a) {
+ int *ai = a;
+ int *bi = h->mem + h->cas;
+ int i = h->sn;
+ while( i-- ) {
+ *bi = *(ai++);
+ bi += 2;
}
- ai = a + sn;
- bi = b + 1 - cas;
- for (i = 0; i < dn; i++) {
- *bi = *ai;
+ ai = a + h->sn;
+ bi = h->mem + 1 - h->cas;
+ i = h->dn ;
+ while( i-- ) {
+ *bi = *(ai++);
bi += 2;
- ai++;
}
}
@@ -178,13 +197,11 @@
/* <summary> */
/* Inverse lazy transform (vertical). */
/* </summary> */
-static void dwt_interleave_v(int *a, int *b, int dn, int sn, int x, int cas) {
- int i;
- int *ai = NULL;
- int *bi = NULL;
- ai = a;
- bi = b + cas;
- for (i = 0; i < sn; i++) {
+static void dwt_interleave_v(dwt_t* v, int *a, int x) {
+ int *ai = a;
+ int *bi = v->mem + v->cas;
+ int i = v->sn;
+ while( i-- ) {
*bi = *ai;
bi += 2;
ai += x;
@@ -189,9 +206,10 @@
bi += 2;
ai += x;
}
- ai = a + (sn * x);
- bi = b + 1 - cas;
- for (i = 0; i < dn; i++) {
+ ai = a + (v->sn * x);
+ bi = v->mem + 1 - v->cas;
+ i = v->dn ;
+ while( i-- ) {
*bi = *ai;
bi += 2;
ai += x;
@@ -223,7 +241,7 @@
/* <summary> */
/* Inverse 5-3 wavelet tranform in 1-D. */
/* </summary> */
-static void dwt_decode_1(int *a, int dn, int sn, int cas) {
+static void dwt_decode_1_(int *a, int dn, int sn, int cas) {
int i;
if (!cas) {
@@ -241,6 +259,13 @@
}
}
+/* <summary> */
+/* Inverse 5-3 wavelet tranform in 1-D. */
+/* </summary> */
+static void dwt_decode_1(dwt_t *v) {
+ dwt_decode_1_(v->mem, v->dn, v->sn, v->cas);
+}
+
/* <summary> */
/* Forward 9-7 wavelet transform in 1-D. */
/* </summary> */
@@ -279,40 +304,101 @@
}
}
+#define WS(i) v->mem[(i)*2]
+#define WD(i) v->mem[(1+(i)*2)]
+
+static void dwt_decode_sm(dwt_t* v, int k, int n, int x) {
+ int m = k > n ? n : k;
+ int l = v->mem[1]; //D(0);
+ int j;
+ int i;
+ for (i = 0; i < m; i++) {
+ j = l;
+ WS(i) -= fix_mul( ( l = WD(i) ) + j , x);
+ }
+ if( i < k ) {
+ l = fix_mul( l + l , x );
+ for (; i < k; i++)
+ WS(i) -= l;
+ }
+}
+
+static void dwt_decode_sp(dwt_t* v, int k, int n, int x) {
+ int m = k > n ? n : k;
+ int l = v->mem[1]; //D(0);
+ int j;
+ int i;
+ for (i = 0; i < m; i++) {
+ j = l;
+ WS(i) += fix_mul( ( l = WD(i) ) + j , x);
+ }
+ if( i < k ) {
+ l = fix_mul( l + l , x );
+ for (; i < k; i++)
+ WS(i) += l;
+ }
+}
+
+static void dwt_decode_dm(dwt_t* v, int k, int n, int x) {
+ int m = k >= n ? n-1 : k;
+ int l = v->mem[0]; //S(0);
+ int i;
+ int j;
+ for (i = 0; i < m; i++) {
+ j = l;
+ WD(i) -= fix_mul( ( l = WS(i+1) ) + j , x);
+ }
+ if( i < k ) {
+ l = fix_mul( l + l , x );
+ for (; i < k; i++)
+ WD(i) -= l;
+ }
+}
+
+static void dwt_decode_dp(dwt_t* v, int k, int n, int x) {
+ int m = k >= n ? n-1 : k;
+ int l = v->mem[0]; //S(0);
+ int i;
+ int j;
+ for (i = 0; i < m; i++) {
+ j = l;
+ WD(i) += fix_mul( ( l = WS(i+1) ) + j , x);
+ }
+
+ if( i < k ) {
+ l = fix_mul( l + l , x );
+ for (; i < k; i++)
+ WD(i) += l;
+ }
+}
+
+
/* <summary> */
/* Inverse 9-7 wavelet transform in 1-D. */
/* </summary> */
-static void dwt_decode_1_real(int *a, int dn, int sn, int cas) {
+static void dwt_decode_1_real(dwt_t* v) {
int i;
- if (!cas) {
- if ((dn > 0) || (sn > 1)) { /* NEW : CASE ONE ELEMENT */
- for (i = 0; i < sn; i++)
- S(i) = fix_mul(S(i), 10078); /* 10076 */
- for (i = 0; i < dn; i++)
- D(i) = fix_mul(D(i), 13318); /* 13320 */
- for (i = 0; i < sn; i++)
- S(i) -= fix_mul(D_(i - 1) + D_(i), 3633);
- for (i = 0; i < dn; i++)
- D(i) -= fix_mul(S_(i) + S_(i + 1), 7233);
- for (i = 0; i < sn; i++)
- S(i) += fix_mul(D_(i - 1) + D_(i), 434);
- for (i = 0; i < dn; i++)
- D(i) += fix_mul(S_(i) + S_(i + 1), 12994); /* 12993 */
+ if (!v->cas) {
+ if ((v->dn > 0) || (v->sn > 1)) { /* NEW : CASE ONE ELEMENT */
+ for (i = 0; i < v->sn; i++)
+ WS(i) = fix_mul(WS(i), 10078); /* 10076 */
+ for (i = 0; i < v->dn; i++)
+ WD(i) = fix_mul(WD(i), 13318); /* 13320 */
+ dwt_decode_sm(v, v->sn, v->dn, 3633);
+ dwt_decode_dm(v, v->dn, v->sn, 7233);
+ dwt_decode_sp(v, v->sn, v->dn, 434);
+ dwt_decode_dp(v, v->dn, v->sn, 12994);
}
} else {
- if ((sn > 0) || (dn > 1)) { /* NEW : CASE ONE ELEMENT */
- for (i = 0; i < sn; i++)
- D(i) = fix_mul(D(i), 10078); /* 10076 */
- for (i = 0; i < dn; i++)
- S(i) = fix_mul(S(i), 13318); /* 13320 */
- for (i = 0; i < sn; i++)
- D(i) -= fix_mul(SS_(i) + SS_(i + 1), 3633);
- for (i = 0; i < dn; i++)
- S(i) -= fix_mul(DD_(i) + DD_(i - 1), 7233);
- for (i = 0; i < sn; i++)
- D(i) += fix_mul(SS_(i) + SS_(i + 1), 434);
- for (i = 0; i < dn; i++)
- S(i) += fix_mul(DD_(i) + DD_(i - 1), 12994); /* 12993 */
+ if ((v->sn > 0) || (v->dn > 1)) { /* NEW : CASE ONE ELEMENT */
+ for (i = 0; i < v->sn; i++)
+ WD(i) = fix_mul(WD(i), 10078); /* 10076 */
+ for (i = 0; i < v->dn; i++)
+ WS(i) = fix_mul(WS(i), 13318); /* 13320 */
+ dwt_decode_dm(v, v->sn, v->dn, 3633);
+ dwt_decode_sm(v, v->dn, v->sn, 7233);
+ dwt_decode_dp(v, v->sn, v->dn, 434);
+ dwt_decode_sp(v, v->dn, v->sn, 12994);
}
}
}
@@ -391,55 +477,7 @@
/* Inverse 5-3 wavelet tranform in 2-D. */
/* </summary> */
void dwt_decode(opj_tcd_tilecomp_t * tilec, int stop) {
- int i, j, k;
- int *a = NULL;
- int *aj = NULL;
- int *bj = NULL;
- int w, l;
-
- w = tilec->x1-tilec->x0;
- l = tilec->numresolutions-1;
- a = tilec->data;
-
- for (i = l - 1; i >= stop; i--) {
- int rw; /* width of the resolution level computed */
- int rh; /* heigth of the resolution level computed */
- int rw1; /* width of the resolution level once lower than computed one */
- int rh1; /* height of the resolution level once lower than computed one */
- int cas_col; /* 0 = non inversion on horizontal filtering 1 = inversion between low-pass and high-pass filtering */
- int cas_row; /* 0 = non inversion on vertical filtering 1 = inversion between low-pass and high-pass filtering */
- int dn, sn;
-
- rw = tilec->resolutions[l - i].x1 - tilec->resolutions[l - i].x0;
- rh = tilec->resolutions[l - i].y1 - tilec->resolutions[l - i].y0;
- rw1= tilec->resolutions[l - i - 1].x1 - tilec->resolutions[l - i - 1].x0;
- rh1= tilec->resolutions[l - i - 1].y1 - tilec->resolutions[l - i - 1].y0;
-
- cas_row = tilec->resolutions[l - i].x0 % 2;
- cas_col = tilec->resolutions[l - i].y0 % 2;
-
- sn = rw1;
- dn = rw - rw1;
- bj = (int*)opj_malloc(rw * sizeof(int));
- for (j = 0; j < rh; j++) {
- aj = a + j*w;
- dwt_interleave_h(aj, bj, dn, sn, cas_row);
- dwt_decode_1(bj, dn, sn, cas_row);
- for (k = 0; k < rw; k++) aj[k] = bj[k];
- }
- opj_free(bj);
-
- sn = rh1;
- dn = rh - rh1;
- bj = (int*)opj_malloc(rh * sizeof(int));
- for (j = 0; j < rw; j++) {
- aj = a + j;
- dwt_interleave_v(aj, bj, dn, sn, w, cas_col);
- dwt_decode_1(bj, dn, sn, cas_col);
- for (k = 0; k < rh; k++) aj[k * w] = bj[k];
- }
- opj_free(bj);
- }
+ dwt_decode_tile(tilec, stop, &dwt_decode_1);
}
@@ -522,55 +560,7 @@
/* Inverse 9-7 wavelet transform in 2-D. */
/* </summary> */
void dwt_decode_real(opj_tcd_tilecomp_t * tilec, int stop) {
- int i, j, k;
- int *a = NULL;
- int *aj = NULL;
- int *bj = NULL;
- int w, l;
-
- w = tilec->x1-tilec->x0;
- l = tilec->numresolutions-1;
- a = tilec->data;
-
- for (i = l-1; i >= stop; i--) {
- int rw; /* width of the resolution level computed */
- int rh; /* heigth of the resolution level computed */
- int rw1; /* width of the resolution level once lower than computed one */
- int rh1; /* height of the resolution level once lower than computed one */
- int cas_col; /* 0 = non inversion on horizontal filtering 1 = inversion between low-pass and high-pass filtering */
- int cas_row; /* 0 = non inversion on vertical filtering 1 = inversion between low-pass and high-pass filtering */
- int dn, sn;
-
- rw = tilec->resolutions[l - i].x1 - tilec->resolutions[l - i].x0;
- rh = tilec->resolutions[l - i].y1 - tilec->resolutions[l - i].y0;
- rw1= tilec->resolutions[l - i - 1].x1 - tilec->resolutions[l - i - 1].x0;
- rh1= tilec->resolutions[l - i - 1].y1 - tilec->resolutions[l - i - 1].y0;
-
- cas_col = tilec->resolutions[l - i].x0 % 2; /* 0 = non inversion on horizontal filtering 1 = inversion between low-pass and high-pass filtering */
- cas_row = tilec->resolutions[l - i].y0 % 2; /* 0 = non inversion on vertical filtering 1 = inversion between low-pass and high-pass filtering */
-
- sn = rw1;
- dn = rw-rw1;
- bj = (int*)opj_malloc(rw * sizeof(int));
- for (j = 0; j < rh; j++) {
- aj = a + j * w;
- dwt_interleave_h(aj, bj, dn, sn, cas_col);
- dwt_decode_1_real(bj, dn, sn, cas_col);
- for (k = 0; k < rw; k++) aj[k] = bj[k];
- }
- opj_free(bj);
-
- sn = rh1;
- dn = rh-rh1;
- bj = (int*)opj_malloc(rh * sizeof(int));
- for (j = 0; j < rw; j++) {
- aj = a + j;
- dwt_interleave_v(aj, bj, dn, sn, w, cas_row);
- dwt_decode_1_real(bj, dn, sn, cas_row);
- for (k = 0; k < rh; k++) aj[k * w] = bj[k];
- }
- opj_free(bj);
- }
+ dwt_decode_tile(tilec, stop, dwt_decode_1_real);
}
@@ -609,3 +599,85 @@
dwt_encode_stepsize((int) floor(stepsize * 8192.0), prec + gain, &tccp->stepsizes[bandno]);
}
}
+
+
+/* <summary> */
+/* Determine maximum computed resolution level for inverse wavelet transform */
+/* </summary> */
+static int dwt_decode_max_resolution(opj_tcd_resolution_t* r, int i) {
+ int mr = 1;
+ int w;
+ while( --i ) {
+ r++;
+ if( mr < ( w = r->x1 - r->x0 ) )
+ mr = w ;
+ if( mr < ( w = r->y1 - r->y0 ) )
+ mr = w ;
+ }
+ return mr ;
+}
+
+
+/* <summary> */
+/* Inverse wavelet tranform in 2-D. */
+/* </summary> */
+static void dwt_decode_tile(opj_tcd_tilecomp_t * tilec, int stop, DWT1DFN dwt_1D) {
+ opj_tcd_resolution_t* tr;
+ int i, j, k;
+ int *a = NULL;
+ int *aj = NULL;
+ int *m;
+ int w; //, l;
+ int rw; /* width of the resolution level computed */
+ int rh; /* heigth of the resolution level computed */
+ dwt_t h;
+ dwt_t v;
+
+ if( 1 > ( i = tilec->numresolutions - stop ) )
+ return ;
+
+ tr = tilec->resolutions;
+
+ w = tilec->x1-tilec->x0;
+ a = tilec->data;
+
+ m = (int*)opj_malloc(sizeof(int) * (dwt_decode_max_resolution(tr, i)+5));
+ h.mem = v.mem = (int*)( (unsigned)m + 16 - ( (unsigned)m % 16 ) ) ;
+
+ rw = tr->x1 - tr->x0;
+ rh = tr->y1 - tr->y0;
+
+ while( --i ) {
+ tr++;
+ h.sn = rw;
+ v.sn = rh;
+ h.dn = ( rw = tr->x1 - tr->x0 ) - h.sn;
+ v.dn = ( rh = tr->y1 - tr->y0 ) - v.sn;
+
+ h.cas = tr->x0 % 2;
+ v.cas = tr->y0 % 2;
+
+ aj = a;
+ j = rh;
+ while( j-- ) {
+ dwt_interleave_h(&h, aj);
+ (dwt_1D)(&h);
+ k = rw;
+ while( k-- )
+ aj[k] = h.mem[k];
+ aj += w;
+ }
+
+ aj = a;
+ j = rw;
+ while( j-- ) {
+ dwt_interleave_v(&v, aj, w);
+ (dwt_1D)(&v);
+ k = rh;
+ while( k-- )
+ aj[k * w] = v.mem[k];
+ aj++;
+ }
+ }
+ opj_free(m);
+}
More information about the SLDev
mailing list