[sldev] [patch] OpenJPEG DWT optimizations
Francois-Olivier Devaux
fodprolux at gmail.com
Tue Apr 3 23:14:00 PDT 2007
Hi Dzonatas,
I went quickly through the patch, did some quick testing and it seems to
work great !
Great speeding improvement !
We'll have to do some more throughout testing with some strange images sizes
to test it deeply, but it seems the code will be integrated in the next
OpenJPEG version.
Great job,
François-Olivier
On 4/3/07, Dzonatas <dzonatas at dzonux.net> wrote:
>
> Hello,
>
> I believe this patch is ready. I have spent much time to test very
> little changes to make sure the the output of this optimized version
> matches the previous version found in the svn trunk on www.openjpeg.org.
> I have not managed to obtain a collection of j2k files to test every
> known case; however, I merged similar functions together to help test
> different formats in a single run.
>
> This version adds optimization, but it does not add GCC4 vector ops. I
> wanted to make sure this patch can easily be compiled by other versions
> and compilers. It still uses the same basic principles for fixed-point
> integer math, but many redundant calculations have been moved around to
> a more optimal place. There are more optimizations that can be done;
> however, I traded those extra optimization steps for readability.
>
> Results:
>
> "ed_05000.jp2" is 652K in compressed form and 6MB uncompressed
>
> decode of "ed_05000.jp2" without attached patch:
> [INFO] - dwt took 2.750000 s
> real 0m15.784s
> user 0m5.550s
> sys 0m10.340s
>
> decode of "ed_05000.jp2" with attached patch
> [INFO] - dwt took 1.870000 s
> real 0m12.304s
> user 0m6.570s
> sys 0m5.610s
>
> About 3.5 seconds faster with the patch.
>
> "bird.jp2" is 156K in compressed form and 18MB uncompressed
>
> decode of "bird.jp2" without attached patch:
> [INFO] - dwt took 11.190000 s
> real 0m38.438s
> user 0m12.170s
> sys 0m26.520s
>
> decode of "bird.jp2" with attached patch
> [INFO] - dwt took 10.350000 s
> real 0m28.572s
> user 0m9.590s
> sys 0m18.030s
>
> About 10 seconds faster with the patch.
>
> In both of these cases, the "user" time deltas reflects the amount of
> optimization in time relative to the DWT. The "sys" time deltas reflects
> the amount of relief the optimization causes such as less cache hits,
> page faults, context switches, etc.
>
> The overall performance gain is estimated to about 30%.
>
> I hope you like the attached patch. =)
>
> --
>
> Index: libopenjpeg/dwt.c
> ===================================================================
> --- libopenjpeg/dwt.c (revision 369)
> +++ libopenjpeg/dwt.c (working copy)
> @@ -56,6 +56,24 @@
> /** @defgroup DWT DWT - Implementation of a discrete wavelet transform */
> /*@{*/
>
> +/** @name Local data structures */
> +/*@{*/
> +
> +typedef struct
> + {
> + int * mem ;
> + int dn ;
> + int sn ;
> + int cas ;
> + } dwt_t ;
> +
> +/*@}*/
> +
> +/**
> +Virtual function type for wavelet transform in 1-D
> +*/
> +typedef void (*DWT1DFN)(dwt_t* v);
> +
> /** @name Local static functions */
> /*@{*/
>
> @@ -70,11 +88,11 @@
> /**
> Inverse lazy transform (horizontal)
> */
> -static void dwt_interleave_h(int *a, int *b, int dn, int sn, int cas);
> +static void dwt_interleave_h(dwt_t* h, int *a);
> /**
> Inverse lazy transform (vertical)
> */
> -static void dwt_interleave_v(int *a, int *b, int dn, int sn, int x, int
> cas);
> +static void dwt_interleave_v(dwt_t* v, int *a, int x);
> /**
> Forward 5-3 wavelet tranform in 1-D
> */
> @@ -82,7 +100,7 @@
> /**
> Inverse 5-3 wavelet tranform in 1-D
> */
> -static void dwt_decode_1(int *a, int dn, int sn, int cas);
> +static void dwt_decode_1(dwt_t *v);
> /**
> Forward 9-7 wavelet transform in 1-D
> */
> @@ -90,7 +108,7 @@
> /**
> Inverse 9-7 wavelet transform in 1-D
> */
> -static void dwt_decode_1_real(int *a, int dn, int sn, int cas);
> +static void dwt_decode_1_real(dwt_t *v);
> /**
> FIXME : comment ???
> */
> @@ -95,6 +113,10 @@
> FIXME : comment ???
> */
> static void dwt_encode_stepsize(int stepsize, int numbps, opj_stepsize_t
> *bandno_stepsize);
> +/**
> +Inverse wavelet tranform in 2-D.
> +*/
> +static void dwt_decode_tile(opj_tcd_tilecomp_t * tilec, int stop ,
> DWT1DFN fn);
>
> /*@}*/
>
> @@ -155,23 +177,20 @@
> /* <summary> */
> /* Inverse lazy transform (horizontal). */
> /* </summary> */
> -static void dwt_interleave_h(int *a, int *b, int dn, int sn, int cas) {
> - int i;
> - int *ai = NULL;
> - int *bi = NULL;
> - ai = a;
> - bi = b + cas;
> - for (i = 0; i < sn; i++) {
> - *bi = *ai;
> - bi += 2;
> - ai++;
> +static void dwt_interleave_h(dwt_t* h, int *a) {
> + int *ai = a;
> + int *bi = h->mem + h->cas;
> + int i = h->sn;
> + while( i-- ) {
> + *bi = *(ai++);
> + bi += 2;
> }
> - ai = a + sn;
> - bi = b + 1 - cas;
> - for (i = 0; i < dn; i++) {
> - *bi = *ai;
> + ai = a + h->sn;
> + bi = h->mem + 1 - h->cas;
> + i = h->dn ;
> + while( i-- ) {
> + *bi = *(ai++);
> bi += 2;
> - ai++;
> }
> }
>
> @@ -178,13 +197,11 @@
> /* <summary> */
> /* Inverse lazy transform (vertical). */
> /* </summary> */
> -static void dwt_interleave_v(int *a, int *b, int dn, int sn, int x, int
> cas) {
> - int i;
> - int *ai = NULL;
> - int *bi = NULL;
> - ai = a;
> - bi = b + cas;
> - for (i = 0; i < sn; i++) {
> +static void dwt_interleave_v(dwt_t* v, int *a, int x) {
> + int *ai = a;
> + int *bi = v->mem + v->cas;
> + int i = v->sn;
> + while( i-- ) {
> *bi = *ai;
> bi += 2;
> ai += x;
> @@ -189,9 +206,10 @@
> bi += 2;
> ai += x;
> }
> - ai = a + (sn * x);
> - bi = b + 1 - cas;
> - for (i = 0; i < dn; i++) {
> + ai = a + (v->sn * x);
> + bi = v->mem + 1 - v->cas;
> + i = v->dn ;
> + while( i-- ) {
> *bi = *ai;
> bi += 2;
> ai += x;
> @@ -223,7 +241,7 @@
> /* <summary> */
> /* Inverse 5-3 wavelet tranform in 1-D. */
> /* </summary> */
> -static void dwt_decode_1(int *a, int dn, int sn, int cas) {
> +static void dwt_decode_1_(int *a, int dn, int sn, int cas) {
> int i;
>
> if (!cas) {
> @@ -241,6 +259,13 @@
> }
> }
>
> +/* <summary> */
> +/* Inverse 5-3 wavelet tranform in 1-D. */
> +/* </summary> */
> +static void dwt_decode_1(dwt_t *v) {
> + dwt_decode_1_(v->mem, v->dn, v->sn, v->cas);
> +}
> +
> /* <summary> */
> /* Forward 9-7 wavelet transform in 1-D. */
> /* </summary> */
> @@ -279,40 +304,101 @@
> }
> }
>
> +#define WS(i) v->mem[(i)*2]
> +#define WD(i) v->mem[(1+(i)*2)]
> +
> +static void dwt_decode_sm(dwt_t* v, int k, int n, int x) {
> + int m = k > n ? n : k;
> + int l = v->mem[1]; //D(0);
> + int j;
> + int i;
> + for (i = 0; i < m; i++) {
> + j = l;
> + WS(i) -= fix_mul( ( l = WD(i) ) + j , x);
> + }
> + if( i < k ) {
> + l = fix_mul( l + l , x );
> + for (; i < k; i++)
> + WS(i) -= l;
> + }
> +}
> +
> +static void dwt_decode_sp(dwt_t* v, int k, int n, int x) {
> + int m = k > n ? n : k;
> + int l = v->mem[1]; //D(0);
> + int j;
> + int i;
> + for (i = 0; i < m; i++) {
> + j = l;
> + WS(i) += fix_mul( ( l = WD(i) ) + j , x);
> + }
> + if( i < k ) {
> + l = fix_mul( l + l , x );
> + for (; i < k; i++)
> + WS(i) += l;
> + }
> +}
> +
> +static void dwt_decode_dm(dwt_t* v, int k, int n, int x) {
> + int m = k >= n ? n-1 : k;
> + int l = v->mem[0]; //S(0);
> + int i;
> + int j;
> + for (i = 0; i < m; i++) {
> + j = l;
> + WD(i) -= fix_mul( ( l = WS(i+1) ) + j , x);
> + }
> + if( i < k ) {
> + l = fix_mul( l + l , x );
> + for (; i < k; i++)
> + WD(i) -= l;
> + }
> +}
> +
> +static void dwt_decode_dp(dwt_t* v, int k, int n, int x) {
> + int m = k >= n ? n-1 : k;
> + int l = v->mem[0]; //S(0);
> + int i;
> + int j;
> + for (i = 0; i < m; i++) {
> + j = l;
> + WD(i) += fix_mul( ( l = WS(i+1) ) + j , x);
> + }
> +
> + if( i < k ) {
> + l = fix_mul( l + l , x );
> + for (; i < k; i++)
> + WD(i) += l;
> + }
> +}
> +
> +
> /* <summary> */
> /* Inverse 9-7 wavelet transform in 1-D. */
> /* </summary> */
> -static void dwt_decode_1_real(int *a, int dn, int sn, int cas) {
> +static void dwt_decode_1_real(dwt_t* v) {
> int i;
> - if (!cas) {
> - if ((dn > 0) || (sn > 1)) { /* NEW : CASE ONE ELEMENT
> */
> - for (i = 0; i < sn; i++)
> - S(i) = fix_mul(S(i), 10078); /* 10076
> */
> - for (i = 0; i < dn; i++)
> - D(i) = fix_mul(D(i), 13318); /* 13320
> */
> - for (i = 0; i < sn; i++)
> - S(i) -= fix_mul(D_(i - 1) + D_(i), 3633);
> - for (i = 0; i < dn; i++)
> - D(i) -= fix_mul(S_(i) + S_(i + 1), 7233);
> - for (i = 0; i < sn; i++)
> - S(i) += fix_mul(D_(i - 1) + D_(i), 434);
> - for (i = 0; i < dn; i++)
> - D(i) += fix_mul(S_(i) + S_(i + 1),
> 12994); /* 12993 */
> + if (!v->cas) {
> + if ((v->dn > 0) || (v->sn > 1)) { /* NEW : CASE ONE
> ELEMENT */
> + for (i = 0; i < v->sn; i++)
> + WS(i) = fix_mul(WS(i), 10078); /* 10076
> */
> + for (i = 0; i < v->dn; i++)
> + WD(i) = fix_mul(WD(i), 13318); /* 13320
> */
> + dwt_decode_sm(v, v->sn, v->dn, 3633);
> + dwt_decode_dm(v, v->dn, v->sn, 7233);
> + dwt_decode_sp(v, v->sn, v->dn, 434);
> + dwt_decode_dp(v, v->dn, v->sn, 12994);
> }
> } else {
> - if ((sn > 0) || (dn > 1)) { /* NEW : CASE ONE ELEMENT
> */
> - for (i = 0; i < sn; i++)
> - D(i) = fix_mul(D(i), 10078); /* 10076
> */
> - for (i = 0; i < dn; i++)
> - S(i) = fix_mul(S(i), 13318); /* 13320
> */
> - for (i = 0; i < sn; i++)
> - D(i) -= fix_mul(SS_(i) + SS_(i + 1),
> 3633);
> - for (i = 0; i < dn; i++)
> - S(i) -= fix_mul(DD_(i) + DD_(i - 1),
> 7233);
> - for (i = 0; i < sn; i++)
> - D(i) += fix_mul(SS_(i) + SS_(i + 1), 434);
> - for (i = 0; i < dn; i++)
> - S(i) += fix_mul(DD_(i) + DD_(i - 1),
> 12994); /* 12993 */
> + if ((v->sn > 0) || (v->dn > 1)) { /* NEW : CASE ONE
> ELEMENT */
> + for (i = 0; i < v->sn; i++)
> + WD(i) = fix_mul(WD(i), 10078); /* 10076
> */
> + for (i = 0; i < v->dn; i++)
> + WS(i) = fix_mul(WS(i), 13318); /* 13320
> */
> + dwt_decode_dm(v, v->sn, v->dn, 3633);
> + dwt_decode_sm(v, v->dn, v->sn, 7233);
> + dwt_decode_dp(v, v->sn, v->dn, 434);
> + dwt_decode_sp(v, v->dn, v->sn, 12994);
> }
> }
> }
> @@ -391,55 +477,7 @@
> /* Inverse 5-3 wavelet tranform in 2-D. */
> /* </summary> */
> void dwt_decode(opj_tcd_tilecomp_t * tilec, int stop) {
> - int i, j, k;
> - int *a = NULL;
> - int *aj = NULL;
> - int *bj = NULL;
> - int w, l;
> -
> - w = tilec->x1-tilec->x0;
> - l = tilec->numresolutions-1;
> - a = tilec->data;
> -
> - for (i = l - 1; i >= stop; i--) {
> - int rw; /* width of the resolution level
> computed */
> - int rh; /* heigth of the resolution level
> computed */
> - int rw1; /* width of the resolution level
> once lower than computed one */
> - int rh1; /* height of the resolution level
> once lower than computed one */
> - int cas_col; /* 0 = non inversion on horizontal
> filtering 1 = inversion between low-pass and high-pass filtering */
> - int cas_row; /* 0 = non inversion on vertical filtering
> 1 = inversion between low-pass and high-pass filtering */
> - int dn, sn;
> -
> - rw = tilec->resolutions[l - i].x1 - tilec->resolutions[l -
> i].x0;
> - rh = tilec->resolutions[l - i].y1 - tilec->resolutions[l -
> i].y0;
> - rw1= tilec->resolutions[l - i - 1].x1 -
> tilec->resolutions[l - i - 1].x0;
> - rh1= tilec->resolutions[l - i - 1].y1 -
> tilec->resolutions[l - i - 1].y0;
> -
> - cas_row = tilec->resolutions[l - i].x0 % 2;
> - cas_col = tilec->resolutions[l - i].y0 % 2;
> -
> - sn = rw1;
> - dn = rw - rw1;
> - bj = (int*)opj_malloc(rw * sizeof(int));
> - for (j = 0; j < rh; j++) {
> - aj = a + j*w;
> - dwt_interleave_h(aj, bj, dn, sn, cas_row);
> - dwt_decode_1(bj, dn, sn, cas_row);
> - for (k = 0; k < rw; k++) aj[k] = bj[k];
> - }
> - opj_free(bj);
> -
> - sn = rh1;
> - dn = rh - rh1;
> - bj = (int*)opj_malloc(rh * sizeof(int));
> - for (j = 0; j < rw; j++) {
> - aj = a + j;
> - dwt_interleave_v(aj, bj, dn, sn, w, cas_col);
> - dwt_decode_1(bj, dn, sn, cas_col);
> - for (k = 0; k < rh; k++) aj[k * w] = bj[k];
> - }
> - opj_free(bj);
> - }
> + dwt_decode_tile(tilec, stop, &dwt_decode_1);
> }
>
>
> @@ -522,55 +560,7 @@
> /* Inverse 9-7 wavelet transform in 2-D. */
> /* </summary> */
> void dwt_decode_real(opj_tcd_tilecomp_t * tilec, int stop) {
> - int i, j, k;
> - int *a = NULL;
> - int *aj = NULL;
> - int *bj = NULL;
> - int w, l;
> -
> - w = tilec->x1-tilec->x0;
> - l = tilec->numresolutions-1;
> - a = tilec->data;
> -
> - for (i = l-1; i >= stop; i--) {
> - int rw; /* width of the resolution level
> computed */
> - int rh; /* heigth of the resolution level
> computed */
> - int rw1; /* width of the resolution level
> once lower than computed one */
> - int rh1; /* height of the resolution level
> once lower than computed one */
> - int cas_col; /* 0 = non inversion on horizontal
> filtering 1 = inversion between low-pass and high-pass filtering */
> - int cas_row; /* 0 = non inversion on vertical filtering
> 1 = inversion between low-pass and high-pass filtering */
> - int dn, sn;
> -
> - rw = tilec->resolutions[l - i].x1 - tilec->resolutions[l -
> i].x0;
> - rh = tilec->resolutions[l - i].y1 - tilec->resolutions[l -
> i].y0;
> - rw1= tilec->resolutions[l - i - 1].x1 -
> tilec->resolutions[l - i - 1].x0;
> - rh1= tilec->resolutions[l - i - 1].y1 -
> tilec->resolutions[l - i - 1].y0;
> -
> - cas_col = tilec->resolutions[l - i].x0 % 2; /* 0 = non
> inversion on horizontal filtering 1 = inversion between low-pass and
> high-pass filtering */
> - cas_row = tilec->resolutions[l - i].y0 % 2; /* 0 = non
> inversion on vertical filtering 1 = inversion between low-pass and high-pass
> filtering */
> -
> - sn = rw1;
> - dn = rw-rw1;
> - bj = (int*)opj_malloc(rw * sizeof(int));
> - for (j = 0; j < rh; j++) {
> - aj = a + j * w;
> - dwt_interleave_h(aj, bj, dn, sn, cas_col);
> - dwt_decode_1_real(bj, dn, sn, cas_col);
> - for (k = 0; k < rw; k++) aj[k] = bj[k];
> - }
> - opj_free(bj);
> -
> - sn = rh1;
> - dn = rh-rh1;
> - bj = (int*)opj_malloc(rh * sizeof(int));
> - for (j = 0; j < rw; j++) {
> - aj = a + j;
> - dwt_interleave_v(aj, bj, dn, sn, w, cas_row);
> - dwt_decode_1_real(bj, dn, sn, cas_row);
> - for (k = 0; k < rh; k++) aj[k * w] = bj[k];
> - }
> - opj_free(bj);
> - }
> + dwt_decode_tile(tilec, stop, dwt_decode_1_real);
> }
>
>
> @@ -609,3 +599,85 @@
> dwt_encode_stepsize((int) floor(stepsize * 8192.0), prec +
> gain, &tccp->stepsizes[bandno]);
> }
> }
> +
> +
> +/* <summary> */
> +/* Determine maximum computed resolution level for inverse wavelet
> transform */
> +/* </summary> */
> +static int dwt_decode_max_resolution(opj_tcd_resolution_t* r, int i) {
> + int mr = 1;
> + int w;
> + while( --i ) {
> + r++;
> + if( mr < ( w = r->x1 - r->x0 ) )
> + mr = w ;
> + if( mr < ( w = r->y1 - r->y0 ) )
> + mr = w ;
> + }
> + return mr ;
> +}
> +
> +
> +/* <summary> */
> +/* Inverse wavelet tranform in 2-D. */
> +/* </summary> */
> +static void dwt_decode_tile(opj_tcd_tilecomp_t * tilec, int stop, DWT1DFN
> dwt_1D) {
> + opj_tcd_resolution_t* tr;
> + int i, j, k;
> + int *a = NULL;
> + int *aj = NULL;
> + int *m;
> + int w; //, l;
> + int rw; /* width of the resolution level
> computed */
> + int rh; /* heigth of the resolution level
> computed */
> + dwt_t h;
> + dwt_t v;
> +
> + if( 1 > ( i = tilec->numresolutions - stop ) )
> + return ;
> +
> + tr = tilec->resolutions;
> +
> + w = tilec->x1-tilec->x0;
> + a = tilec->data;
> +
> + m = (int*)opj_malloc(sizeof(int) * (dwt_decode_max_resolution(tr,
> i)+5));
> + h.mem = v.mem = (int*)( (unsigned)m + 16 - ( (unsigned)m % 16 ) )
> ;
> +
> + rw = tr->x1 - tr->x0;
> + rh = tr->y1 - tr->y0;
> +
> + while( --i ) {
> + tr++;
> + h.sn = rw;
> + v.sn = rh;
> + h.dn = ( rw = tr->x1 - tr->x0 ) - h.sn;
> + v.dn = ( rh = tr->y1 - tr->y0 ) - v.sn;
> +
> + h.cas = tr->x0 % 2;
> + v.cas = tr->y0 % 2;
> +
> + aj = a;
> + j = rh;
> + while( j-- ) {
> + dwt_interleave_h(&h, aj);
> + (dwt_1D)(&h);
> + k = rw;
> + while( k-- )
> + aj[k] = h.mem[k];
> + aj += w;
> + }
> +
> + aj = a;
> + j = rw;
> + while( j-- ) {
> + dwt_interleave_v(&v, aj, w);
> + (dwt_1D)(&v);
> + k = rh;
> + while( k-- )
> + aj[k * w] = v.mem[k];
> + aj++;
> + }
> + }
> + opj_free(m);
> +}
>
> _______________________________________________
> Click here to unsubscribe or manage your list subscription:
> /index.html
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: http://lists.secondlife.com/pipermail/sldev/attachments/20070404/fbdc6ae5/attachment-0001.htm
More information about the SLDev
mailing list