From 21fce2ab9e576d63481af88a5998462fb4dae208 Mon Sep 17 00:00:00 2001 From: John Cupitt Date: Thu, 9 Jun 2011 11:39:31 +0100 Subject: [PATCH] im_aconv() works for a large 2d mask needs some more tuning, we should be able to speed it up still --- libvips/convolution/im_aconv.c | 152 +++++++++++++++++++++------------ 1 file changed, 98 insertions(+), 54 deletions(-) diff --git a/libvips/convolution/im_aconv.c b/libvips/convolution/im_aconv.c index d9c8758e..39dfdfa1 100644 --- a/libvips/convolution/im_aconv.c +++ b/libvips/convolution/im_aconv.c @@ -50,18 +50,37 @@ TODO - - tried a 201x201 mask, sigmal 44.6, minamp 0.1, does not seem to - read the whole mask? we get only 37638 elements, max and min are - messed up +timing: + +$ time vips im_conv_f img_0075.jpg x2.v g2d201.con +real 11m58.769s +user 22m46.390s +sys 0m3.270s + +$ time vips im_aconv img_0075.jpg x.v g2d201.con 10 10 +boxes_new: min = 0, max = 1 +boxes_new: depth = 0.1, n_layers = 10 +boxes_new: generated 1130 boxes +boxes_new: clustering with thresh 10 ... +boxes_new: renumbering ... +boxes_new: after renumbering, 14 boxes remain +real 0m34.377s +user 1m0.440s +sys 0m0.370s + +$ vips im_subtract x.v x2.v diff.v +$ vips im_abs diff.v abs.v +$ vips im_max abs.v +2.70833 + + - can we use rolling averages for the vertical pass? + + - add more bandfmt - are we handling mask offset correctly? */ -/* Show sample pixels as they are transformed. -#define DEBUG_PIXELS - */ - /* */ #define DEBUG @@ -87,7 +106,7 @@ /* Maximum number of boxes we can break the mask into. */ -#define MAX_LINES (1000) +#define MAX_LINES (10000) /* Get an (x,y) value from a mask. */ @@ -181,6 +200,8 @@ boxes_end( Boxes *boxes, int x, int y, int factor ) static int boxes_distance( Boxes *boxes, int a, int b ) { + g_assert( boxes->weight[a] > 0 && boxes->weight[b] > 0 ); + return( abs( boxes->start[a] - boxes->start[b] ) + abs( boxes->end[a] - boxes->end[b] ) ); } @@ -264,26 +285,61 @@ boxes_renumber( Boxes *boxes ) { int i, j; - for( i = 0; i < boxes->n_hlines; i++ ) { - if( boxes->weight[i] == 0 ) { - /* We move hlines i + 1 down, so we need to adjust all - * band[] refs to match. - */ - for( j = 0; j < boxes->n_vlines; j++ ) - if( boxes->band[j] <= i ) - boxes->band[j] -= 1; - - for( j = i; j < boxes->n_hlines; j++ ) { - boxes->start[j] = boxes->start[j + 1]; - boxes->end[j] = boxes->end[j + 1]; - boxes->weight[j] = boxes->weight[j + 1]; - } - - boxes->n_hlines -= 1; + /* Loop for all zero-weight hlines. + */ + for( i = 0; i < boxes->n_hlines; ) { + if( boxes->weight[i] > 0 ) { + i++; + continue; } + + /* We move hlines i + 1 down, so we need to adjust all + * band[] refs to match. + */ + for( j = 0; j < boxes->n_vlines; j++ ) + if( boxes->band[j] > i ) + boxes->band[j] -= 1; + + for( j = i; j < boxes->n_hlines; j++ ) { + boxes->start[j] = boxes->start[j + 1]; + boxes->end[j] = boxes->end[j + 1]; + boxes->weight[j] = boxes->weight[j + 1]; + } + + boxes->n_hlines -= 1; } } +#ifdef DEBUG +static void +boxes_print( Boxes *boxes ) +{ + int x, y; + + printf( "lines:\n" ); + printf( " n b r f w\n" ); + for( y = 0; y < boxes->n_vlines; y++ ) { + int b = boxes->band[y]; + + printf( "%3d %3d %3d %2d %2d ", + y, b, + boxes->row[y], boxes->factor[y], + boxes->weight[b] ); + for( x = 0; x < 50; x++ ) { + int rx = x * (boxes->mask->xsize + 1) / 50; + + if( rx >= boxes->start[b] && rx < boxes->end[b] ) + printf( "#" ); + else + printf( " " ); + } + printf( " %3d .. %3d\n", boxes->start[b], boxes->end[b] ); + } + printf( "area = %d\n", boxes->area ); + printf( "rounding = %d\n", boxes->rounding ); +} +#endif /*DEBUG*/ + /* Break a mask into boxes. */ static Boxes * @@ -399,13 +455,13 @@ boxes_new( IMAGE *in, IMAGE *out, DOUBLEMASK *mask, int n_layers, int cluster ) VIPS_DEBUG_MSG( "boxes_new: generated %d boxes\n", boxes->n_hlines ); - VIPS_DEBUG_MSG( "boxes_new: clustering with thresh %d ...\n", cluster ); while( boxes_cluster( boxes, cluster ) ) ; + VIPS_DEBUG_MSG( "boxes_new: renumbering ...\n" ); boxes_renumber( boxes ); - VIPS_DEBUG_MSG( "boxes_new: after clustering, %d boxes remain\n", + VIPS_DEBUG_MSG( "boxes_new: after renumbering, %d boxes remain\n", boxes->n_hlines ); /* Find the area of the lines. @@ -438,27 +494,17 @@ boxes_new( IMAGE *in, IMAGE *out, DOUBLEMASK *mask, int n_layers, int cluster ) boxes->area = rint( sum * boxes->area / mask->scale ); boxes->rounding = (boxes->area + 1) / 2 + mask->offset * boxes->area; - /* ASCII-art layer drawing. +#ifdef DEBUG + boxes_print( boxes ); +#endif /*DEBUG*/ + + /* With 512x512 tiles, each hline requires 3mb of intermediate per + * thread ... 300 lines is about a gb per thread, ouch. */ - printf( "lines:\n" ); - printf( " n b r f\n" ); - for( y = 0; y < boxes->n_vlines; y++ ) { - int b = boxes->band[y]; - - printf( "%3d %3d %3d %2d ", - y, b, boxes->row[y], boxes->factor[y] ); - for( x = 0; x < 50; x++ ) { - int rx = x * (mask->xsize + 1) / 50; - - if( rx >= boxes->start[b] && rx < boxes->end[b] ) - printf( "#" ); - else - printf( " " ); - } - printf( " %3d .. %3d\n", boxes->start[b], boxes->end[b] ); + if( boxes->n_hlines > 150 ) { + im_error( "im_aconv", "%s", _( "mask too complex" ) ); + return( NULL ); } - printf( "area = %d\n", boxes->area ); - printf( "rounding = %d\n", boxes->rounding ); return( boxes ); } @@ -768,9 +814,6 @@ aconv_vgenerate( REGION *or, void *vseq, void *a, void *b ) if( im_prepare( ir, &s ) ) return( -1 ); - /* Stride can be different for the vertical case, keep this here for - * ease of direction change. - */ istride = IM_REGION_LSKIP( ir ) / IM_IMAGE_SIZEOF_ELEMENT( in ); ostride = IM_REGION_LSKIP( or ) / @@ -789,23 +832,24 @@ aconv_vgenerate( REGION *or, void *vseq, void *a, void *b ) switch( boxes->in->BandFmt ) { case IM_BANDFMT_UCHAR: - for( y = 0; y < r->height; y++ ) { + for( x = 0; x < sz; x++ ) { int *p; PEL *q; int sum; - p = (int *) IM_REGION_ADDR( ir, r->left, r->top + y ); - q = (PEL *) IM_REGION_ADDR( or, r->left, r->top + y ); + p = x * boxes->n_hlines + + (int *) IM_REGION_ADDR( ir, r->left, r->top ); + q = x + (PEL *) IM_REGION_ADDR( or, r->left, r->top ); - for( x = 0; x < sz; x++ ) { + for( y = 0; y < r->height; y++ ) { sum = 0; for( z = 0; z < n_vlines; z++ ) sum += boxes->factor[z] * p[seq->start[z]]; + p += istride; sum = (sum + boxes->rounding) / boxes->area; CLIP_UCHAR( sum ); *q = sum; - q += 1; - p += boxes->n_hlines; + q += ostride; } }