[sldev] [patch] Vectorized DWT in OpenJPEG
Dzonatas
dzonatas at dzonux.net
Sat Apr 7 20:21:42 PDT 2007
Hello,
Attached is the patch to move the DWT into vectorized code. It works
with floating point, so I only enabled it for the 9-7 decode.
Speed wise, it is just slightly slower than KDU. One my reference file
"bird.jp2," KDU decodes in 2 seconds DWT time, and this patch will make
OpenJPEG decode the same in under 4 seconds in DWT time.
Actual speed-up will vary overall, but it ranges from 2x to 20x in just
DWT time.
Enjoy. =)
--
-------------- next part --------------
Index: libopenjpeg/dwt.c
===================================================================
--- libopenjpeg/dwt.c (revision 372)
+++ libopenjpeg/dwt.c (working copy)
@@ -38,6 +38,11 @@
#define WS(i) v->mem[(i)*2]
#define WD(i) v->mem[(1+(i)*2)]
+#ifdef __GNUC__
+#define DWT_FP_VECTORIZATION
+typedef float v4sf __attribute__ ((vector_size (16))) ;
+#endif
+
/** @name Local data structures */
/*@{*/
@@ -48,6 +53,32 @@
int cas ;
} dwt_t ;
+#ifdef DWT_FP_VECTORIZATION
+
+typedef union {
+ float f[4] ;
+ v4sf v ;
+ } v4 ;
+
+typedef struct v4dwt_local {
+ v4 * wavelet ;
+ int dn ;
+ int sn ;
+ int cas ;
+ } v4dwt_t ;
+
+#define PF( x ) { x / 8192.0f , x / 8192.0f , x / 8192.0f , x / 8192.0f } ;
+
+const v4 v10078 = PF( 10078.0f ) ;
+const v4 v13318 = PF( 13318.0f ) ;
+const v4 v3633 = PF( -3633.0f ) ;
+const v4 v7233 = PF( -7233.0f ) ;
+const v4 v434 = PF( 434.0f ) ;
+const v4 v12994 = PF( 12994.0f ) ;
+
+#endif
+
+
/*@}*/
/**
@@ -95,9 +126,13 @@
*/
static void dwt_encode_stepsize(int stepsize, int numbps, opj_stepsize_t *bandno_stepsize);
/**
-Inverse wavelet tranform in 2-D.
+Inverse wavelet tranform in 2-D. (fixed point)
*/
static void dwt_decode_tile(opj_tcd_tilecomp_t * tilec, int stop , DWT1DFN fn);
+/**
+Inverse wavelet tranform in 2-D. (floating point)
+*/
+static void dwt_decode_tile_fp(opj_tcd_tilecomp_t * tilec, int stop);
/*@}*/
@@ -287,66 +322,78 @@
static void dwt_decode_sm(dwt_t* v, int k, int n, int x) {
int m = k > n ? n : k;
- int l = v->mem[1]; //D(0);
+ int i;
int j;
- int i;
- for (i = 0; i < m; i++) {
- j = l;
- WS(i) -= fix_mul( ( l = WD(i) ) + j , x);
+ int *w;
+ int *l;
+ w = l = v->mem + 1;
+ for (i = m; --i>=0;) {
+ *(w-1) -= fix_mul( *l + *w , x);
+ l = w;
+ w += 2;
}
- if( i < k ) {
- l = fix_mul( l + l , x );
- for (; i < k; i++)
- WS(i) -= l;
+ if( m < k ) {
+ int y = fix_mul( *l + *l , x );
+ for (; m < k; m++, w+=2)
+ *(w-1) -= y;
}
}
static void dwt_decode_sp(dwt_t* v, int k, int n, int x) {
int m = k > n ? n : k;
- int l = v->mem[1]; //D(0);
+ int i;
int j;
- int i;
- for (i = 0; i < m; i++) {
- j = l;
- WS(i) += fix_mul( ( l = WD(i) ) + j , x);
+ int *w;
+ int *l;
+ w = l = v->mem + 1;
+ for (i = m; --i>=0;) {
+ *(w-1) += fix_mul( *l + *w , x);
+ l = w;
+ w += 2;
}
- if( i < k ) {
- l = fix_mul( l + l , x );
- for (; i < k; i++)
- WS(i) += l;
+ if( m < k ) {
+ int y = fix_mul( *l + *l , x );
+ for (; m < k; m++, w+=2)
+ *(w-1) += y;
}
}
+
static void dwt_decode_dm(dwt_t* v, int k, int n, int x) {
int m = k >= n ? n-1 : k;
- int l = v->mem[0]; //S(0);
int i;
int j;
- for (i = 0; i < m; i++) {
- j = l;
- WD(i) -= fix_mul( ( l = WS(i+1) ) + j , x);
+ int *w;
+ int *l;
+ w = ( l = v->mem ) + 2;
+ for (i = m; --i>=0;) {
+ *(w-1) -= fix_mul( *l + *w , x);
+ l = w;
+ w += 2;
}
- if( i < k ) {
- l = fix_mul( l + l , x );
- for (; i < k; i++)
- WD(i) -= l;
+ if( m < k ) {
+ int y = fix_mul( *l + *l , x );
+ for (; m < k; m++, w+=2)
+ *(w-1) -= y;
}
}
static void dwt_decode_dp(dwt_t* v, int k, int n, int x) {
int m = k >= n ? n-1 : k;
- int l = v->mem[0]; //S(0);
int i;
int j;
- for (i = 0; i < m; i++) {
- j = l;
- WD(i) += fix_mul( ( l = WS(i+1) ) + j , x);
+ int *w;
+ int *l;
+ w = ( l = v->mem ) + 2;
+ for (i = m; --i>=0;) {
+ *(w-1) += fix_mul( *l + *w , x);
+ l = w;
+ w += 2;
}
-
- if( i < k ) {
- l = fix_mul( l + l , x );
- for (; i < k; i++)
- WD(i) += l;
+ if( m < k ) {
+ int y = fix_mul( *l + *l , x );
+ for (; m < k; m++, w+=2)
+ *(w-1) += y;
}
}
@@ -538,7 +585,11 @@
/* Inverse 9-7 wavelet transform in 2-D. */
/* </summary> */
void dwt_decode_real(opj_tcd_tilecomp_t * tilec, int stop) {
+#ifdef DWT_FP_VECTORIZATION
+ dwt_decode_tile_fp(tilec, stop);
+#else
dwt_decode_tile(tilec, stop, dwt_decode_1_real);
+#endif
}
@@ -659,3 +710,230 @@
}
opj_free(m);
}
+
+
+#ifdef DWT_FP_VECTORIZATION
+
+static void v4dwt_interleave_h( v4dwt_t* w , float* a , int x ) {
+ int i;
+ float *ai;
+ v4 *bi;
+ ai = a ;
+ bi = w->wavelet + w->cas;
+ i = w->sn ;
+ while( i-- )
+ {
+ bi->f[0] = *ai;
+ bi->f[1] = *(ai + x);
+ bi->f[2] = *(ai + x*2);
+ bi->f[3] = *(ai + x*3);
+ bi += 2;
+ ai++;
+ }
+ ai = a + w->sn ; // update: NOT NEEDED ?
+ bi = w->wavelet + 1 - w->cas ;
+ i = w->dn ;
+ while( i-- )
+ {
+ bi->f[0] = *ai;
+ bi->f[1] = *(ai + x);
+ bi->f[2] = *(ai + x*2);
+ bi->f[3] = *(ai + x*3);
+ bi += 2;
+ ai++;
+ }
+}
+
+static void v4dwt_interleave_v( v4dwt_t* w , float* a , int x ) {
+ int i;
+ float *ai;
+ v4 *bi;
+
+ ai = a;
+ bi = w->wavelet + w->cas;
+ i = w->sn ;
+ while( i-- ) {
+ bi->f[0] = *ai;
+ bi->f[1] = *(ai+1);
+ bi->f[2] = *(ai+2);
+ bi->f[3] = *(ai+3);
+ bi += 2;
+ ai += x;
+ }
+ ai = a + (w->sn * x);
+ bi = w->wavelet + 1 - w->cas;
+ i = w->dn ;
+ while( i-- ) {
+ bi->f[0] = *ai;
+ bi->f[1] = *(ai+1);
+ bi->f[2] = *(ai+2);
+ bi->f[3] = *(ai+3);
+ bi += 2;
+ ai += x;
+ }
+}
+
+static void v4dwt_decode_s(v4 *t, int k, int n, const v4 c) {
+ int m = k > n ? n : k;
+ int i;
+ int j;
+ v4 *w;
+ v4 *l;
+ w = l = t;
+ for (i = m; --i>=0;) {
+ (w-1)->v += ( l->v + w->v ) * c.v;
+ l = w;
+ w += 2;
+ }
+ for (; m < k; m++, w+=2)
+ (w-1)->v += ( l->v + l->v ) * c.v;
+}
+
+
+static void v4dwt_decode_d(v4 *t, int k, int n, const v4 c) {
+ int m = k >= n ? n-1 : k;
+ int i;
+ int j;
+ v4 *w;
+ v4 *l;
+ w = ( l = t ) + 2;
+ for (i = m; --i>=0;) {
+ (w-1)->v += ( l->v + w->v ) * c.v;
+ l = w;
+ w += 2;
+ }
+ for (; m < k; m++, w+=2)
+ (w-1)->v += ( l->v + l->v ) * c.v;
+}
+
+/* <summary> */
+/* Inverse 9-7 wavelet transform in 1-D. */
+/* </summary> */
+static void v4dwt_decode(v4dwt_t* w) {
+ int i;
+ int sn = w->sn;
+ int dn = w->dn;
+ v4 *f;
+ v4 *s = w->wavelet;
+ v4 *d = s + 1;
+ if (w->cas == 0) {
+ if ((dn > 0) || (sn > 1)) {
+ for( f = s - 2, i = sn; --i>=0;)
+ ( f += 2 )->v *= v10078.v ;
+ for( f = d - 2, i = dn; --i>=0;)
+ ( f += 2 )->v *= v13318.v ;
+ v4dwt_decode_s(d, sn, dn, v3633);
+ v4dwt_decode_d(s, dn, sn, v7233);
+ v4dwt_decode_s(d, sn, dn, v434);
+ v4dwt_decode_d(s, dn, sn, v12994);
+ }
+ } else {
+ if ((sn > 0) || (dn > 1)) {
+ for( f = d - 2, i = sn; --i>=0;)
+ ( f += 2 )->v *= v10078.v ;
+ for( f = s - 2, i = dn; --i>=0;)
+ ( f += 2 )->v *= v13318.v ;
+ v4dwt_decode_d(s, sn, dn, v3633);
+ v4dwt_decode_s(d, dn, sn, v7233);
+ v4dwt_decode_d(s, sn, dn, v434);
+ v4dwt_decode_s(d, dn, sn, v12994);
+ }
+ }
+}
+
+
+/* <summary> */
+/* Inverse wavelet tranform in 2-D. */
+/* </summary> */
+static void dwt_decode_tile_fp(opj_tcd_tilecomp_t * tilec, int stop) {
+ opj_tcd_resolution_t* tr;
+ int i, j, k;
+ float *a = NULL;
+ float *aj = NULL;
+ void *m;
+ int w;
+ int rw; /* width of the resolution level computed */
+ int rh; /* heigth of the resolution level computed */
+ v4dwt_t h;
+ v4dwt_t v;
+ union {
+ int i;
+ float f;
+ } *q;
+
+ if( 1 > ( i = tilec->numresolutions - stop ) )
+ return ;
+
+ tr = tilec->resolutions;
+
+ w = tilec->x1-tilec->x0;
+ a = (void*)tilec->data;
+
+ m = (void*)opj_malloc(sizeof(v4) * (dwt_decode_max_resolution(tr, i)+5));
+ h.wavelet = v.wavelet = (v4*)( (unsigned)m + 16 - ( (unsigned)m % 16 ) ) ;
+
+ rw = tr->x1 - tr->x0;
+ rh = tr->y1 - tr->y0;
+
+ for( q = (void *)a, j = w * (tilec->y1 - tilec->y0)/*rh*/ ; --j>=0;)
+ (q++)->f = (float)q->i / 8192.0;
+
+ while( --i ) {
+ tr++;
+ h.sn = rw;
+ v.sn = rh;
+ h.dn = ( rw = tr->x1 - tr->x0 ) - h.sn;
+ v.dn = ( rh = tr->y1 - tr->y0 ) - v.sn;
+ h.cas = tr->x0 % 2;
+ v.cas = tr->y0 % 2;
+
+ for( aj = a, j = rh/4; --j>=0; aj+=w*4) {
+ v4dwt_interleave_h(&h, aj, w);
+ v4dwt_decode(&h);
+ for( k=rw; --k>=0;) {
+ aj[k ] = h.wavelet[k].f[0];
+ aj[k+w ] = h.wavelet[k].f[1];
+ aj[k+w*2] = h.wavelet[k].f[2];
+ aj[k+w*3] = h.wavelet[k].f[3];
+ }
+ }
+ if( ( j = rh % 4 ) ) {
+ v4dwt_interleave_h(&h, aj, w);
+ v4dwt_decode(&h);
+ for( k=rw; --k>=0;) {
+ switch(j) {
+ case 3: aj[k+w*2] = h.wavelet[k].f[2];
+ case 2: aj[k+w ] = h.wavelet[k].f[1];
+ case 1: aj[k ] = h.wavelet[k].f[0];
+ }
+ }
+ }
+ for( aj = a, j = rw/4; --j>=0; aj+=4) {
+ v4dwt_interleave_v(&v, aj, w);
+ v4dwt_decode(&v);
+ for( k=rh; --k>=0;) {
+ aj[k*w ] = v.wavelet[k].f[0];
+ aj[1+k*w] = v.wavelet[k].f[1];
+ aj[2+k*w] = v.wavelet[k].f[2];
+ aj[3+k*w] = v.wavelet[k].f[3];
+ }
+ }
+
+ if( ( j = rw % 4 ) ) {
+ v4dwt_interleave_v(&v, aj, w);
+ v4dwt_decode(&v);
+ for( k=rh; --k>=0;) {
+ switch(j) {
+ case 3: aj[2+k*w] = v.wavelet[k].f[2];
+ case 2: aj[1+k*w] = v.wavelet[k].f[1];
+ case 1: aj[k*w ] = v.wavelet[k].f[0];
+ }
+ }
+ }
+ }
+ for( q = (void *)a, j = w * rh; --j>=0;)
+ (q++)->i = floorf(q->f * 8192.0);
+ opj_free(m);
+}
+
+#endif // DWT_FP_VECTORIZATION
Index: libopenjpeg/fix.h
===================================================================
--- libopenjpeg/fix.h (revision 372)
+++ libopenjpeg/fix.h (working copy)
@@ -48,15 +48,14 @@
/*@{*/
/**
-Multiply two fixed-precision rational numbers.
+Multiply two fixed-precision rational numbers. (32bit)
@param a
@param b
@return Returns a * b
*/
static INLINE int fix_mul(int a, int b) {
int64 temp = (int64) a * (int64) b ;
- temp += temp & 4096;
- return (int) (temp >> 13) ;
+ return (int) ((temp + (temp & 4096)) >> 13) ;
}
/*@}*/
More information about the SLDev
mailing list