Merge pull request #374 from lovell/conv-allow-vectorise

Improve convolution performance by 20-25%
This commit is contained in:
John Cupitt 2016-01-26 09:18:11 +00:00
commit baf5e860e3
4 changed files with 21 additions and 34 deletions

View File

@ -628,11 +628,6 @@ conv_start( IMAGE *out, void *a, void *b )
return( seq );
}
/* One step of the convolution sum: add t[i] * p[i][x] to sum, then advance
 * i by one. Takes no arguments, so sum, t, p, i and x must all be in scope
 * wherever it expands. Used as the OPER argument of
 * IM_UNROLL( conv->nnz, INNER ) in the per-pixel loops below.
 */
#define INNER { \
sum += t[i] * p[i][x]; \
i += 1; \
}
/* INT inner loops.
*/
#define CONV_INT( TYPE, IM_CLIP ) { \
@ -640,15 +635,14 @@ conv_start( IMAGE *out, void *a, void *b )
TYPE * restrict q = (TYPE *) IM_REGION_ADDR( or, le, y ); \
\
for( x = 0; x < sz; x++ ) { \
int sum; \
int i; \
\
sum = 0; \
i = 0; \
IM_UNROLL( conv->nnz, INNER ); \
\
int sum = 0; \
\
for ( i = 0; i < nnz; i++ ) \
sum += t[i] * p[i][x]; \
\
sum = ((sum + rounding) / mask->scale) + mask->offset; \
\
\
IM_CLIP; \
\
q[x] = sum; \
@ -662,12 +656,11 @@ conv_start( IMAGE *out, void *a, void *b )
TYPE * restrict q = (TYPE *) IM_REGION_ADDR( or, le, y ); \
\
for( x = 0; x < sz; x++ ) { \
double sum; \
int i; \
\
sum = 0; \
i = 0; \
IM_UNROLL( conv->nnz, INNER ); \
double sum = 0; \
\
for ( i = 0; i < nnz; i++ ) \
sum += t[i] * p[i][x]; \
\
sum = (sum / mask->scale) + mask->offset; \
\
@ -686,6 +679,7 @@ conv_gen( REGION *or, void *vseq, void *a, void *b )
REGION *ir = seq->ir;
INTMASK *mask = conv->mask;
int * restrict t = conv->coeff;
const int nnz = conv->nnz;
/* You might think this should be (scale + 1) / 2, but then we'd be
* adding one for scale == 1.
@ -718,7 +712,7 @@ conv_gen( REGION *or, void *vseq, void *a, void *b )
if( seq->last_bpl != IM_REGION_LSKIP( ir ) ) {
seq->last_bpl = IM_REGION_LSKIP( ir );
for( i = 0; i < conv->nnz; i++ ) {
for( i = 0; i < nnz; i++ ) {
z = conv->coeff_pos[i];
x = z % conv->mask->xsize;
y = z / conv->mask->xsize;
@ -732,7 +726,7 @@ conv_gen( REGION *or, void *vseq, void *a, void *b )
for( y = to; y < bo; y++ ) {
/* Init pts for this line of PELs.
*/
for( z = 0; z < conv->nnz; z++ )
for( z = 0; z < nnz; z++ )
seq->pts[z] = seq->offsets[z] +
IM_REGION_ADDR( ir, le, y );

View File

@ -204,22 +204,16 @@ conv_start( IMAGE *out, void *a, void *b )
return( (void *) seq );
}
/* One step of the convolution sum: add t[i] * p[i][x] to sum, then advance
 * i by one. Takes no arguments, so sum, t, p, i and x must all be in scope
 * wherever it expands. Used as the OPER argument of
 * IM_UNROLL( conv->nnz, INNER ) inside CONV_FLOAT.
 */
#define INNER { \
sum += t[i] * p[i][x]; \
i += 1; \
}
#define CONV_FLOAT( ITYPE, OTYPE ) { \
ITYPE ** restrict p = (ITYPE **) seq->pts; \
OTYPE * restrict q = (OTYPE *) IM_REGION_ADDR( or, le, y ); \
\
for( x = 0; x < sz; x++ ) { \
double sum; \
int i; \
\
sum = 0; \
i = 0; \
IM_UNROLL( conv->nnz, INNER ); \
double sum = 0; \
\
for ( i = 0; i < nnz; i++ ) \
sum += t[i] * p[i][x]; \
\
sum = (sum / mask->scale) + mask->offset; \
\
@ -238,6 +232,7 @@ conv_gen( REGION *or, void *vseq, void *a, void *b )
REGION *ir = seq->ir;
DOUBLEMASK *mask = conv->mask;
double * restrict t = conv->coeff;
const int nnz = conv->nnz;
Rect *r = &or->valid;
Rect s;
@ -264,7 +259,7 @@ conv_gen( REGION *or, void *vseq, void *a, void *b )
if( seq->last_bpl != IM_REGION_LSKIP( ir ) ) {
seq->last_bpl = IM_REGION_LSKIP( ir );
for( i = 0; i < conv->nnz; i++ ) {
for( i = 0; i < nnz; i++ ) {
z = conv->coeff_pos[i];
x = z % conv->mask->xsize;
y = z / conv->mask->xsize;
@ -278,8 +273,8 @@ conv_gen( REGION *or, void *vseq, void *a, void *b )
for( y = to; y < bo; y++ ) {
/* Init pts for this line of PELs.
*/
for( z = 0; z < conv->nnz; z++ )
seq->pts[z] = seq->offsets[z] +
for( z = 0; z < nnz; z++ )
seq->pts[z] = seq->offsets[z] +
IM_REGION_ADDR( ir, le, y );
switch( in->BandFmt ) {

View File

@ -137,7 +137,6 @@ extern "C" {
#define NUMBER(R) IM_NUMBER(R)
#define ARRAY(IM,N,T) IM_ARRAY(IM,N,T)
#define UNROLL( N, OPER ) IM_UNROLL( N, OPER )
#define RINT( R ) IM_RINT( R )
#define CLIP_UCHAR( V, SEQ ) IM_CLIP_UCHAR( V, SEQ )

View File

@ -165,7 +165,6 @@ extern "C" {
#define IM_CLIP_USHORT VIPS_CLIP_USHORT
#define IM_CLIP_SHORT VIPS_CLIP_SHORT
#define IM_CLIP_NONE VIPS_CLIP_NONE
#define IM_UNROLL VIPS_UNROLL
#define IM_SWAP VIPS_SWAP
#define IM_IMAGE_ADDR VIPS_IMAGE_ADDR