Merge pull request #374 from lovell/conv-allow-vectorise
Improve convolution performance by 20-25%
This commit is contained in:
commit
baf5e860e3
@ -628,11 +628,6 @@ conv_start( IMAGE *out, void *a, void *b )
|
||||
return( seq );
|
||||
}
|
||||
|
||||
#define INNER { \
|
||||
sum += t[i] * p[i][x]; \
|
||||
i += 1; \
|
||||
}
|
||||
|
||||
/* INT inner loops.
|
||||
*/
|
||||
#define CONV_INT( TYPE, IM_CLIP ) { \
|
||||
@ -640,15 +635,14 @@ conv_start( IMAGE *out, void *a, void *b )
|
||||
TYPE * restrict q = (TYPE *) IM_REGION_ADDR( or, le, y ); \
|
||||
\
|
||||
for( x = 0; x < sz; x++ ) { \
|
||||
int sum; \
|
||||
int i; \
|
||||
\
|
||||
sum = 0; \
|
||||
i = 0; \
|
||||
IM_UNROLL( conv->nnz, INNER ); \
|
||||
\
|
||||
int sum = 0; \
|
||||
\
|
||||
for ( i = 0; i < nnz; i++ ) \
|
||||
sum += t[i] * p[i][x]; \
|
||||
\
|
||||
sum = ((sum + rounding) / mask->scale) + mask->offset; \
|
||||
\
|
||||
\
|
||||
IM_CLIP; \
|
||||
\
|
||||
q[x] = sum; \
|
||||
@ -662,12 +656,11 @@ conv_start( IMAGE *out, void *a, void *b )
|
||||
TYPE * restrict q = (TYPE *) IM_REGION_ADDR( or, le, y ); \
|
||||
\
|
||||
for( x = 0; x < sz; x++ ) { \
|
||||
double sum; \
|
||||
int i; \
|
||||
\
|
||||
sum = 0; \
|
||||
i = 0; \
|
||||
IM_UNROLL( conv->nnz, INNER ); \
|
||||
double sum = 0; \
|
||||
\
|
||||
for ( i = 0; i < nnz; i++ ) \
|
||||
sum += t[i] * p[i][x]; \
|
||||
\
|
||||
sum = (sum / mask->scale) + mask->offset; \
|
||||
\
|
||||
@ -686,6 +679,7 @@ conv_gen( REGION *or, void *vseq, void *a, void *b )
|
||||
REGION *ir = seq->ir;
|
||||
INTMASK *mask = conv->mask;
|
||||
int * restrict t = conv->coeff;
|
||||
const int nnz = conv->nnz;
|
||||
|
||||
/* You might think this should be (scale + 1) / 2, but then we'd be
|
||||
* adding one for scale == 1.
|
||||
@ -718,7 +712,7 @@ conv_gen( REGION *or, void *vseq, void *a, void *b )
|
||||
if( seq->last_bpl != IM_REGION_LSKIP( ir ) ) {
|
||||
seq->last_bpl = IM_REGION_LSKIP( ir );
|
||||
|
||||
for( i = 0; i < conv->nnz; i++ ) {
|
||||
for( i = 0; i < nnz; i++ ) {
|
||||
z = conv->coeff_pos[i];
|
||||
x = z % conv->mask->xsize;
|
||||
y = z / conv->mask->xsize;
|
||||
@ -732,7 +726,7 @@ conv_gen( REGION *or, void *vseq, void *a, void *b )
|
||||
for( y = to; y < bo; y++ ) {
|
||||
/* Init pts for this line of PELs.
|
||||
*/
|
||||
for( z = 0; z < conv->nnz; z++ )
|
||||
for( z = 0; z < nnz; z++ )
|
||||
seq->pts[z] = seq->offsets[z] +
|
||||
IM_REGION_ADDR( ir, le, y );
|
||||
|
||||
|
@ -204,22 +204,16 @@ conv_start( IMAGE *out, void *a, void *b )
|
||||
return( (void *) seq );
|
||||
}
|
||||
|
||||
#define INNER { \
|
||||
sum += t[i] * p[i][x]; \
|
||||
i += 1; \
|
||||
}
|
||||
|
||||
#define CONV_FLOAT( ITYPE, OTYPE ) { \
|
||||
ITYPE ** restrict p = (ITYPE **) seq->pts; \
|
||||
OTYPE * restrict q = (OTYPE *) IM_REGION_ADDR( or, le, y ); \
|
||||
\
|
||||
for( x = 0; x < sz; x++ ) { \
|
||||
double sum; \
|
||||
int i; \
|
||||
\
|
||||
sum = 0; \
|
||||
i = 0; \
|
||||
IM_UNROLL( conv->nnz, INNER ); \
|
||||
double sum = 0; \
|
||||
\
|
||||
for ( i = 0; i < nnz; i++ ) \
|
||||
sum += t[i] * p[i][x]; \
|
||||
\
|
||||
sum = (sum / mask->scale) + mask->offset; \
|
||||
\
|
||||
@ -238,6 +232,7 @@ conv_gen( REGION *or, void *vseq, void *a, void *b )
|
||||
REGION *ir = seq->ir;
|
||||
DOUBLEMASK *mask = conv->mask;
|
||||
double * restrict t = conv->coeff;
|
||||
const int nnz = conv->nnz;
|
||||
|
||||
Rect *r = &or->valid;
|
||||
Rect s;
|
||||
@ -264,7 +259,7 @@ conv_gen( REGION *or, void *vseq, void *a, void *b )
|
||||
if( seq->last_bpl != IM_REGION_LSKIP( ir ) ) {
|
||||
seq->last_bpl = IM_REGION_LSKIP( ir );
|
||||
|
||||
for( i = 0; i < conv->nnz; i++ ) {
|
||||
for( i = 0; i < nnz; i++ ) {
|
||||
z = conv->coeff_pos[i];
|
||||
x = z % conv->mask->xsize;
|
||||
y = z / conv->mask->xsize;
|
||||
@ -278,8 +273,8 @@ conv_gen( REGION *or, void *vseq, void *a, void *b )
|
||||
for( y = to; y < bo; y++ ) {
|
||||
/* Init pts for this line of PELs.
|
||||
*/
|
||||
for( z = 0; z < conv->nnz; z++ )
|
||||
seq->pts[z] = seq->offsets[z] +
|
||||
for( z = 0; z < nnz; z++ )
|
||||
seq->pts[z] = seq->offsets[z] +
|
||||
IM_REGION_ADDR( ir, le, y );
|
||||
|
||||
switch( in->BandFmt ) {
|
||||
|
@ -137,7 +137,6 @@ extern "C" {
|
||||
#define NUMBER(R) IM_NUMBER(R)
|
||||
#define ARRAY(IM,N,T) IM_ARRAY(IM,N,T)
|
||||
|
||||
#define UNROLL( N, OPER ) IM_UNROLL( N, OPER )
|
||||
#define RINT( R ) IM_RINT( R )
|
||||
|
||||
#define CLIP_UCHAR( V, SEQ ) IM_CLIP_UCHAR( V, SEQ )
|
||||
|
@ -165,7 +165,6 @@ extern "C" {
|
||||
#define IM_CLIP_USHORT VIPS_CLIP_USHORT
|
||||
#define IM_CLIP_SHORT VIPS_CLIP_SHORT
|
||||
#define IM_CLIP_NONE VIPS_CLIP_NONE
|
||||
#define IM_UNROLL VIPS_UNROLL
|
||||
#define IM_SWAP VIPS_SWAP
|
||||
|
||||
#define IM_IMAGE_ADDR VIPS_IMAGE_ADDR
|
||||
|
Loading…
Reference in New Issue
Block a user