multipass mode for im_conv()
This commit is contained in:
parent
593af5a8d8
commit
425795a8c8
@ -42,6 +42,7 @@
|
||||
- land the vector branmch ... we have SSE erode/dilate/add/conv
|
||||
- add IM_SWAP
|
||||
- dilate/erode do (!=0) on non-uchar images
|
||||
- add multipass Orc to im_conv(), 3.5x faster for 5x5 mask
|
||||
|
||||
12/5/10 started 7.22.2
|
||||
- the conditional image of ifthenelse can be any format, a (!=0) is added if
|
||||
|
4
TODO
4
TODO
@ -1,6 +1,6 @@
|
||||
- im_conv() might now give a speedup on larger masks and with 16 bits?
|
||||
- try a 32-bit intermediate for im_conv()
|
||||
|
||||
- im_profile() next
|
||||
- gtkdoc for im_profile() next
|
||||
|
||||
|
||||
|
||||
|
@ -61,6 +61,8 @@
|
||||
* - use VipsVector
|
||||
* - get rid of im_convsep(), just call this twice, no longer worth
|
||||
* keeping two versions
|
||||
* 8/11/10
|
||||
* - add array tiling
|
||||
*/
|
||||
|
||||
/*
|
||||
@ -103,26 +105,11 @@
|
||||
|
||||
- will this change make much difference to the vips benchmark?
|
||||
|
||||
- would setting params by index rather than name be any quicker?
|
||||
|
||||
- fix up a signed 8-bit code path?
|
||||
|
||||
- try a path with a 32-bit sum for larger matrices / scale / offset,
|
||||
much slower?
|
||||
|
||||
- try a 16-bit path, though the speedup might not be worthwhile
|
||||
- make up a signed 8-bit code path?
|
||||
|
||||
- with a 5x5 matrix:
|
||||
|
||||
5 5 62 0
|
||||
0 1 1 1 0
|
||||
1 4 6 4 1
|
||||
1 6 10 6 1
|
||||
1 4 6 4 1
|
||||
0 1 1 1 0
|
||||
|
||||
Orc is no faster than C, argh, multipass is not worthwhile for
|
||||
large matrices
|
||||
- try a 16-bit path
|
||||
|
||||
*/
|
||||
|
||||
@ -142,6 +129,22 @@
|
||||
#include <dmalloc.h>
|
||||
#endif /*WITH_DMALLOC*/
|
||||
|
||||
/* We can't run more than this many passes. Larger than this and we
|
||||
* fall back to C.
|
||||
*/
|
||||
#define MAX_PASS (10)
|
||||
|
||||
/* A pass with a vector.
|
||||
*/
|
||||
typedef struct {
|
||||
int first; /* The index of the first mask coff we use */
|
||||
int last; /* The index of the last mask coff we use */
|
||||
|
||||
/* The code we generate for this section of this mask.
|
||||
*/
|
||||
VipsVector *vector;
|
||||
} Pass;
|
||||
|
||||
/* Our parameters ... we take a copy of the mask argument, plus we make a
|
||||
* smaller version with the zeros squeezed out.
|
||||
*/
|
||||
@ -160,14 +163,20 @@ typedef struct {
|
||||
/* The convolver we generate for this mask. We have to split the
|
||||
* convolve and clip into two phases.
|
||||
*/
|
||||
VipsVector *convolve;
|
||||
int n_pass;
|
||||
Pass pass[MAX_PASS];
|
||||
VipsVector *clip;
|
||||
} Conv;
|
||||
|
||||
static void
|
||||
conv_vector_free( Conv *conv )
|
||||
{
|
||||
IM_FREEF( vips_vector_free, conv->convolve );
|
||||
int i;
|
||||
|
||||
for( i = 0; i < conv->n_pass; i++ )
|
||||
IM_FREEF( vips_vector_free, conv->pass[i].vector );
|
||||
conv->n_pass = 0;
|
||||
|
||||
IM_FREEF( vips_vector_free, conv->clip );
|
||||
}
|
||||
|
||||
@ -210,17 +219,16 @@ conv_evalend( Conv *conv )
|
||||
#define ASM2( OP, A, B ) vips_vector_asm2( v, OP, A, B )
|
||||
#define ASM3( OP, A, B, C ) vips_vector_asm3( v, OP, A, B, C )
|
||||
|
||||
/* Generate code for a 3x3 mask. Just do multiply-add, a second pass does the
|
||||
* round and clip.
|
||||
/* Generate code for a section of the mask.
|
||||
*
|
||||
* 0 for success, -1 on error.
|
||||
*/
|
||||
static int
|
||||
conv_compile_convolution_u8s16( Conv *conv )
|
||||
conv_compile_convolution_u8s16_section( Pass *pass, Conv *conv )
|
||||
{
|
||||
INTMASK *mask = conv->mask;
|
||||
const int n_mask = mask->xsize * mask->ysize;
|
||||
|
||||
double min, max;
|
||||
int i;
|
||||
VipsVector *v;
|
||||
char zero[256];
|
||||
@ -228,34 +236,7 @@ conv_compile_convolution_u8s16( Conv *conv )
|
||||
char source[256];
|
||||
char coeff[256];
|
||||
|
||||
if( conv->in->BandFmt != IM_BANDFMT_UCHAR )
|
||||
return( -1 );
|
||||
|
||||
/* Don't test mask size, it's very hard to predict when we will
|
||||
* exhaust the program space.
|
||||
*/
|
||||
|
||||
/* Can the accumulator overflow or underflow at any stage? Since
|
||||
* matrix elements are signed, we need to calculate a running
|
||||
* possible min and max.
|
||||
*/
|
||||
min = 0;
|
||||
max = 0;
|
||||
for( i = 0; i < mask->xsize * mask->ysize; i++ ) {
|
||||
int v = 255 * mask->coeff[i];
|
||||
|
||||
if( min + v < min )
|
||||
min += v;
|
||||
else if( min + v > max )
|
||||
max += v;
|
||||
|
||||
if( max > SHRT_MAX )
|
||||
return( -1 );
|
||||
if( min < SHRT_MIN )
|
||||
return( -1 );
|
||||
}
|
||||
|
||||
conv->convolve = v = vips_vector_new( "conv", 2 );
|
||||
pass->vector = v = vips_vector_new( "conv", 2 );
|
||||
|
||||
/* The value we fetch from the image, the product with the matrix
|
||||
* value, the accumulated sum.
|
||||
@ -267,7 +248,7 @@ conv_compile_convolution_u8s16( Conv *conv )
|
||||
CONST( zero, 0, 2 );
|
||||
ASM2( "copyw", "sum", zero );
|
||||
|
||||
for( i = 0; i < mask->xsize * mask->ysize; i++ ) {
|
||||
for( i = pass->first; i < n_mask; i++ ) {
|
||||
int x = i % mask->xsize;
|
||||
int y = i / mask->xsize;
|
||||
|
||||
@ -315,12 +296,12 @@ conv_compile_convolution_u8s16( Conv *conv )
|
||||
|
||||
ASM3( "addssw", "sum", "sum", "product" );
|
||||
|
||||
/* If we run out of space, fall back to C.
|
||||
*/
|
||||
if( vips_vector_full( v ) )
|
||||
return( -1 );
|
||||
break;
|
||||
}
|
||||
|
||||
pass->last = i;
|
||||
|
||||
ASM2( "copyw", "d1", "sum" );
|
||||
|
||||
if( !vips_vector_compile( v ) )
|
||||
@ -333,7 +314,83 @@ conv_compile_convolution_u8s16( Conv *conv )
|
||||
return( 0 );
|
||||
}
|
||||
|
||||
/* Generate the program that does (sum + rounding) / scale + offset
|
||||
/* Generate the convolution pass for u8 data with an s16 accumulator.
|
||||
*
|
||||
* 0 for success, -1 on error.
|
||||
*/
|
||||
static int
|
||||
conv_compile_convolution_u8s16( Conv *conv )
|
||||
{
|
||||
INTMASK *mask = conv->mask;
|
||||
const int n_mask = mask->xsize * mask->ysize;
|
||||
|
||||
double min, max;
|
||||
int i;
|
||||
|
||||
if( conv->in->BandFmt != IM_BANDFMT_UCHAR )
|
||||
return( -1 );
|
||||
|
||||
/* Can the accumulator overflow or underflow at any stage? Since
|
||||
* matrix elements are signed, we need to calculate a running
|
||||
* possible min and max.
|
||||
*/
|
||||
min = 0;
|
||||
max = 0;
|
||||
for( i = 0; i < n_mask; i++ ) {
|
||||
int v = 255 * mask->coeff[i];
|
||||
|
||||
if( min + v < min )
|
||||
min += v;
|
||||
else if( min + v > max )
|
||||
max += v;
|
||||
|
||||
if( max > SHRT_MAX )
|
||||
return( -1 );
|
||||
if( min < SHRT_MIN )
|
||||
return( -1 );
|
||||
}
|
||||
|
||||
/* Generate passes until we've used up the whole mask.
|
||||
*/
|
||||
for( i = 0;;) {
|
||||
Pass *pass;
|
||||
|
||||
/* Skip any zero coefficients at the start of the mask
|
||||
* region.
|
||||
*/
|
||||
for( ; i < n_mask && !mask->coeff[i]; i++ )
|
||||
;
|
||||
if( i == n_mask )
|
||||
break;
|
||||
|
||||
/* Allocate space for another pass.
|
||||
*/
|
||||
if( conv->n_pass == MAX_PASS )
|
||||
return( -1 );
|
||||
pass = &conv->pass[conv->n_pass];
|
||||
conv->n_pass += 1;
|
||||
|
||||
pass->first = i;
|
||||
pass->last = i;
|
||||
|
||||
if( conv_compile_convolution_u8s16_section( pass, conv ) )
|
||||
return( -1 );
|
||||
i = pass->last + 1;
|
||||
|
||||
#ifdef DEBUG
|
||||
printf( "conv_compile_convolution_u8s16: "
|
||||
"first = %d, last = %d\n",
|
||||
pass->first, pass->last );
|
||||
#endif /*DEBUG*/
|
||||
|
||||
if( i >= n_mask )
|
||||
break;
|
||||
}
|
||||
|
||||
return( 0 );
|
||||
}
|
||||
|
||||
/* Generate the program that does (sum(passes) + rounding) / scale + offset
|
||||
* from a s16 intermediate back to a u8 output.
|
||||
*/
|
||||
static int
|
||||
@ -341,6 +398,7 @@ conv_compile_scale_s16u8( Conv *conv )
|
||||
{
|
||||
INTMASK *mask = conv->mask;
|
||||
|
||||
int i;
|
||||
VipsVector *v;
|
||||
char scale[256];
|
||||
char offset[256];
|
||||
@ -355,7 +413,12 @@ conv_compile_scale_s16u8( Conv *conv )
|
||||
return( -1 );
|
||||
|
||||
conv->clip = v = vips_vector_new( "clip", 1 );
|
||||
vips_vector_source_name( v, "s1", 2 );
|
||||
for( i = 0; i < conv->n_pass; i++ ) {
|
||||
char source[10];
|
||||
|
||||
im_snprintf( source, 10, "s%d", i );
|
||||
vips_vector_source_name( v, source, 2 );
|
||||
}
|
||||
|
||||
TEMP( "t1", 2 );
|
||||
TEMP( "t2", 2 );
|
||||
@ -371,9 +434,19 @@ conv_compile_scale_s16u8( Conv *conv )
|
||||
CONST( offset, mask->offset * mask->scale + mask->scale / 2, 2 );
|
||||
CONST( zero, 0, 2 );
|
||||
|
||||
/* Sum the passes into t1.
|
||||
*/
|
||||
ASM2( "loadw", "t1", "s0" );
|
||||
for( i = 1; i < conv->n_pass; i++ ) {
|
||||
char source[10];
|
||||
|
||||
im_snprintf( source, 10, "s%d", i );
|
||||
ASM3( "addssw", "t1", "t1", source );
|
||||
}
|
||||
|
||||
/* Offset and scale.
|
||||
*/
|
||||
ASM3( "addssw", "t1", "s1", offset );
|
||||
ASM3( "addssw", "t1", "t1", offset );
|
||||
|
||||
/* We need to convert the signed result of the
|
||||
* offset to unsigned for the div, ie. we want to set anything <0 to 0.
|
||||
@ -384,7 +457,8 @@ conv_compile_scale_s16u8( Conv *conv )
|
||||
ASM3( "divluw", "t1", "t1", scale );
|
||||
ASM2( "convuuswb", "d1", "t1" );
|
||||
|
||||
if( !vips_vector_compile( v ) )
|
||||
if( vips_vector_full( v ) ||
|
||||
!vips_vector_compile( v ) )
|
||||
return( -1 );
|
||||
|
||||
#ifdef DEBUG
|
||||
@ -398,7 +472,7 @@ static Conv *
|
||||
conv_new( IMAGE *in, IMAGE *out, INTMASK *mask )
|
||||
{
|
||||
Conv *conv = IM_NEW( out, Conv );
|
||||
const int ne = mask->xsize * mask->ysize;
|
||||
const int n_mask = mask->xsize * mask->ysize;
|
||||
int i;
|
||||
|
||||
if( !conv )
|
||||
@ -413,7 +487,7 @@ conv_new( IMAGE *in, IMAGE *out, INTMASK *mask )
|
||||
conv->underflow = 0;
|
||||
conv->overflow = 0;
|
||||
|
||||
conv->convolve = NULL;
|
||||
conv->n_pass = 0;
|
||||
conv->clip = NULL;
|
||||
|
||||
if( im_add_close_callback( out,
|
||||
@ -422,14 +496,14 @@ conv_new( IMAGE *in, IMAGE *out, INTMASK *mask )
|
||||
(im_callback_fn) conv_evalstart, conv, NULL ) ||
|
||||
im_add_close_callback( out,
|
||||
(im_callback_fn) conv_evalend, conv, NULL ) ||
|
||||
!(conv->coeff = IM_ARRAY( out, ne, int )) ||
|
||||
!(conv->coeff_pos = IM_ARRAY( out, ne, int )) ||
|
||||
!(conv->coeff = IM_ARRAY( out, n_mask, int )) ||
|
||||
!(conv->coeff_pos = IM_ARRAY( out, n_mask, int )) ||
|
||||
!(conv->mask = im_dup_imask( mask, "conv_mask" )) )
|
||||
return( NULL );
|
||||
|
||||
/* Find non-zero mask elements.
|
||||
*/
|
||||
for( i = 0; i < ne; i++ )
|
||||
for( i = 0; i < n_mask; i++ )
|
||||
if( mask->coeff[i] ) {
|
||||
conv->coeff[conv->nnz] = mask->coeff[i];
|
||||
conv->coeff_pos[conv->nnz] = i;
|
||||
@ -470,10 +544,10 @@ typedef struct {
|
||||
|
||||
int last_bpl; /* Avoid recalcing offsets, if we can */
|
||||
|
||||
/* We need an intermediate buffer to keep the result of the conv in
|
||||
* before we clip it.
|
||||
/* We need a set of intermediate buffers to keep the result of the
|
||||
* conv in before we clip it.
|
||||
*/
|
||||
void *sum;
|
||||
void **sum;
|
||||
} ConvSequence;
|
||||
|
||||
/* Free a sequence value.
|
||||
@ -484,6 +558,8 @@ conv_stop( void *vseq, void *a, void *b )
|
||||
ConvSequence *seq = (ConvSequence *) vseq;
|
||||
Conv *conv = (Conv *) b;
|
||||
|
||||
int i;
|
||||
|
||||
/* Add local under/over counts to global counts.
|
||||
*/
|
||||
conv->overflow += seq->overflow;
|
||||
@ -491,6 +567,8 @@ conv_stop( void *vseq, void *a, void *b )
|
||||
|
||||
IM_FREEF( im_region_free, seq->ir );
|
||||
|
||||
for( i = 0; i < conv->n_pass; i++ )
|
||||
IM_FREE( seq->sum[i] );
|
||||
IM_FREE( seq->sum );
|
||||
|
||||
return( 0 );
|
||||
@ -503,7 +581,9 @@ conv_start( IMAGE *out, void *a, void *b )
|
||||
{
|
||||
IMAGE *in = (IMAGE *) a;
|
||||
Conv *conv = (Conv *) b;
|
||||
|
||||
ConvSequence *seq;
|
||||
int i;
|
||||
|
||||
if( !(seq = IM_NEW( out, ConvSequence )) )
|
||||
return( NULL );
|
||||
@ -523,12 +603,27 @@ conv_start( IMAGE *out, void *a, void *b )
|
||||
seq->ir = im_region_create( in );
|
||||
seq->offsets = IM_ARRAY( out, conv->nnz, int );
|
||||
seq->pts = IM_ARRAY( out, conv->nnz, PEL * );
|
||||
seq->sum = IM_ARRAY( NULL, IM_IMAGE_N_ELEMENTS( in ), short );
|
||||
if( !seq->ir || !seq->offsets || !seq->pts || !seq->sum ) {
|
||||
if( !seq->ir || !seq->offsets || !seq->pts ) {
|
||||
conv_stop( seq, in, conv );
|
||||
return( NULL );
|
||||
}
|
||||
|
||||
if( vips_vector_get_enabled() && conv->n_pass ) {
|
||||
if( !(seq->sum = IM_ARRAY( NULL, conv->n_pass, void * )) ) {
|
||||
conv_stop( seq, in, conv );
|
||||
return( NULL );
|
||||
}
|
||||
for( i = 0; i < conv->n_pass; i++ )
|
||||
seq->sum[i] = NULL;
|
||||
|
||||
for( i = 0; i < conv->n_pass; i++ )
|
||||
if( !(seq->sum[i] = IM_ARRAY( NULL,
|
||||
IM_IMAGE_N_ELEMENTS( in ), short )) ) {
|
||||
conv_stop( seq, in, conv );
|
||||
return( NULL );
|
||||
}
|
||||
}
|
||||
|
||||
return( seq );
|
||||
}
|
||||
|
||||
@ -849,8 +944,8 @@ convvec_gen( REGION *or, void *vseq, void *a, void *b )
|
||||
int sz = IM_REGION_N_ELEMENTS( or ) * (im_iscomplex( in ) ? 2 : 1);
|
||||
|
||||
Rect s;
|
||||
int y;
|
||||
VipsExecutor convolve;
|
||||
int j, y;
|
||||
VipsExecutor convolve[MAX_PASS];
|
||||
VipsExecutor clip;
|
||||
|
||||
/* Prepare the section of the input image we need. A little larger
|
||||
@ -862,13 +957,18 @@ convvec_gen( REGION *or, void *vseq, void *a, void *b )
|
||||
if( im_prepare( ir, &s ) )
|
||||
return( -1 );
|
||||
|
||||
vips_executor_set_program( &convolve, conv->convolve, sz );
|
||||
for( j = 0; j < conv->n_pass; j++ )
|
||||
vips_executor_set_program( &convolve[j],
|
||||
conv->pass[j].vector, sz );
|
||||
vips_executor_set_program( &clip, conv->clip, sz );
|
||||
|
||||
/* Link the combiner to the intermediate buffer.
|
||||
/* Link the conv output to the intermediate buffer, and to the
|
||||
* clipper's input.
|
||||
*/
|
||||
vips_executor_set_destination( &convolve, seq->sum );
|
||||
vips_executor_set_array( &clip, conv->clip->s[0], seq->sum );
|
||||
for( j = 0; j < conv->n_pass; j++ ) {
|
||||
vips_executor_set_destination( &convolve[j], seq->sum[j] );
|
||||
vips_executor_set_array( &clip, conv->clip->s[j], seq->sum[j] );
|
||||
}
|
||||
|
||||
for( y = 0; y < r->height; y++ ) {
|
||||
#ifdef DEBUG_PIXELS
|
||||
@ -885,12 +985,17 @@ convvec_gen( REGION *or, void *vseq, void *a, void *b )
|
||||
}
|
||||
#endif /*DEBUG_PIXELS*/
|
||||
|
||||
vips_executor_set_scanline( &convolve,
|
||||
ir, r->left, r->top + y );
|
||||
vips_executor_run( &convolve );
|
||||
for( j = 0; j < conv->n_pass; j++ ) {
|
||||
vips_executor_set_scanline( &convolve[j],
|
||||
ir, r->left, r->top + y );
|
||||
vips_executor_run( &convolve[j] );
|
||||
}
|
||||
|
||||
#ifdef DEBUG_PIXELS
|
||||
printf( "before clip: %3d\n", *((signed short *) seq->sum) );
|
||||
printf( "before clip:\n" );
|
||||
for( j = 0; j < conv->n_pass; j++ )
|
||||
printf( " %d) %3d\n",
|
||||
j, ((signed short *) seq->sum[j])[0] );
|
||||
#endif /*DEBUG_PIXELS*/
|
||||
|
||||
vips_executor_set_destination( &clip,
|
||||
@ -937,7 +1042,7 @@ im_conv_raw( IMAGE *in, IMAGE *out, INTMASK *mask )
|
||||
return( -1 );
|
||||
}
|
||||
|
||||
if( conv->convolve ) {
|
||||
if( conv->n_pass ) {
|
||||
generate = convvec_gen;
|
||||
|
||||
#ifdef DEBUG
|
||||
@ -1059,12 +1164,12 @@ int
|
||||
im_convsep( IMAGE *in, IMAGE *out, INTMASK *mask )
|
||||
{
|
||||
IMAGE *t1 = im_open_local( out, "im_convsep intermediate", "p" );
|
||||
int size = mask->xsize * mask->ysize;
|
||||
int n_mask = mask->xsize * mask->ysize;
|
||||
|
||||
if( !t1 ||
|
||||
im_embed( in, t1, 1, size / 2, size / 2,
|
||||
in->Xsize + size - 1,
|
||||
in->Ysize + size - 1 ) ||
|
||||
im_embed( in, t1, 1, n_mask / 2, n_mask / 2,
|
||||
in->Xsize + n_mask - 1,
|
||||
in->Ysize + n_mask - 1 ) ||
|
||||
im_convsep_raw( t1, out, mask ) )
|
||||
return( -1 );
|
||||
|
||||
|
@ -372,4 +372,3 @@ vips_executor_run( VipsExecutor *executor )
|
||||
orc_executor_run( &executor->executor );
|
||||
#endif /*HAVE_ORC*/
|
||||
}
|
||||
|
||||
|
@ -77,7 +77,7 @@ typedef enum {
|
||||
/* We can't run more than this many passes. Larger than this and we
|
||||
* fall back to C.
|
||||
*/
|
||||
#define MAX_PASSES (10)
|
||||
#define MAX_PASS (10)
|
||||
|
||||
/* A pass with a vector.
|
||||
*/
|
||||
@ -103,7 +103,7 @@ typedef struct {
|
||||
/* The passes we generate for this mask.
|
||||
*/
|
||||
int n_pass;
|
||||
Pass pass[MAX_PASSES];
|
||||
Pass pass[MAX_PASS];
|
||||
} Morph;
|
||||
|
||||
static void
|
||||
@ -262,7 +262,7 @@ pass_compile( Morph *morph )
|
||||
|
||||
/* Allocate space for another pass.
|
||||
*/
|
||||
if( morph->n_pass == MAX_PASSES )
|
||||
if( morph->n_pass == MAX_PASS )
|
||||
return( -1 );
|
||||
pass = &morph->pass[morph->n_pass];
|
||||
morph->n_pass += 1;
|
||||
@ -327,7 +327,7 @@ morph_new( IMAGE *in, IMAGE *out, INTMASK *mask, MorphOp op )
|
||||
morph->op = op;
|
||||
|
||||
morph->n_pass = 0;
|
||||
for( i = 0; i < MAX_PASSES; i++ )
|
||||
for( i = 0; i < MAX_PASS; i++ )
|
||||
morph->pass[i].vector = NULL;
|
||||
|
||||
if( im_add_close_callback( out,
|
||||
@ -655,7 +655,7 @@ morph_vector_gen( REGION *or, void *vseq, void *a, void *b )
|
||||
|
||||
Rect s;
|
||||
int y, j;
|
||||
VipsExecutor executor[MAX_PASSES];
|
||||
VipsExecutor executor[MAX_PASS];
|
||||
|
||||
/* Prepare the section of the input image we need. A little larger
|
||||
* than the section of the output image we are producing.
|
||||
|
Loading…
Reference in New Issue
Block a user