From 008fd217280257cf899ef4615f835a5b97e28ed1 Mon Sep 17 00:00:00 2001 From: John Cupitt Date: Sat, 14 Oct 2017 17:03:48 +0100 Subject: [PATCH] all done --- ChangeLog | 1 + libvips/convolution/convi.c | 60 ++++++++++++++++++++----------------- 2 files changed, 34 insertions(+), 27 deletions(-) diff --git a/ChangeLog b/ChangeLog index 02a1d43e..fe9fb9a5 100644 --- a/ChangeLog +++ b/ChangeLog @@ -27,6 +27,7 @@ - better gobject-introspection annotations, thanks astavale - vips_image_write() severs all links between images, when it can ... thanks Warren and Nakilon +- vector path for convolution is more accurate and can handle larger masks 29/8/17 started 8.5.9 - make --fail stop jpeg read on any libjpeg warning, thanks @mceachen diff --git a/libvips/convolution/convi.c b/libvips/convolution/convi.c index 68ba2001..2045ede7 100644 --- a/libvips/convolution/convi.c +++ b/libvips/convolution/convi.c @@ -76,6 +76,8 @@ * - remove pts for a small speedup * 12/10/17 * - fix leak of vectors, thanks MHeimbuc + * 14/10/17 + * - switch to half-float for vector path */ /* @@ -108,8 +110,8 @@ /* #define DEBUG #define DEBUG_PIXELS - */ #define DEBUG_COMPILE + */ #ifdef HAVE_CONFIG_H #include @@ -124,13 +126,6 @@ #include "pconvolution.h" -/* We do the 8-bit vector path with fixed-point arithmetic. We use 3.5 bits - * for the mask coefficients, so our range is -4 to +3.99, after using scale - * on the mask. - */ -#define FIXED_BITS (5) -#define FIXED_SCALE (1 << FIXED_BITS) - /* Larger than this and we fall back to C. */ #define MAX_PASS (20) @@ -154,6 +149,7 @@ typedef struct { /* An int version of M. */ VipsImage *iM; + int n_point; /* w * h for our matrix */ /* We make a smaller version of the mask with the zeros squeezed out. */ @@ -161,7 +157,7 @@ typedef struct { int *coeff; /* Array of non-zero mask coefficients */ int *coeff_pos; /* Index of each nnz element in mask->coeff */ - /* And a half float version for a vector path. mant has the signed + /* And a half float version for the vector path. mant has the signed * 8-bit mantissas in [-1, +1), sexp has the exponent shift after the * mul and before the add, and exp has the final exponent shift before * write-back. @@ -169,7 +165,6 @@ typedef struct { int *mant; int sexp; int exp; - int n_point; /* Number of points in fixed-point array */ /* The set of passes we need for this mask. */ @@ -180,10 +175,6 @@ typedef struct { */ int r; VipsVector *vector; - - /* Remove later. - */ - int *fixed; } VipsConvi; typedef VipsConvolutionClass VipsConviClass; @@ -353,11 +344,13 @@ vips_convi_compile_section( VipsConvi *convi, VipsImage *in, Pass *pass ) char source[256]; char off[256]; + char rnd[256]; + char sexp[256]; char coeff[256]; /* Exclude zero elements. */ - if( !convi->fixed[i] ) + if( !convi->mant[i] ) continue; /* The source. sl0 is the first scanline in the mask. @@ -379,9 +372,16 @@ vips_convi_compile_section( VipsConvi *convi, VipsImage *in, Pass *pass ) * of the image and coefficient are interesting, so we can take * the bottom half of a 16x16->32 multiply. */ - CONST( coeff, convi->fixed[i], 2 ); + CONST( coeff, convi->mant[i], 2 ); ASM3( "mullw", "value", "value", coeff ); + /* Shift right before add to prevent overflow on large masks. + */ + CONST( sexp, convi->sexp, 2 ); + CONST( rnd, 1 << (convi->sexp - 1), 2 ); + ASM3( "addw", "value", "value", rnd ); + ASM3( "shrsw", "value", "value", sexp ); + /* We accumulate the signed 16-bit result in sum. Saturated * add. */ @@ -420,8 +420,8 @@ vips_convi_compile_clip( VipsConvi *convi ) int offset = VIPS_RINT( vips_image_get_offset( M ) ); VipsVector *v; - char c16[256]; - char c5[256]; + char rnd[256]; + char exp[256]; char c0[256]; char c255[256]; char off[256]; @@ -436,10 +436,10 @@ vips_convi_compile_clip( VipsConvi *convi ) */ TEMP( "value", 2 ); - CONST( c16, 16, 2 ); - ASM3( "addw", "value", "r", c16 ); - CONST( c5, 5, 2 ); - ASM3( "shrsw", "value", "value", c5 ); + CONST( rnd, 1 << (convi->exp - 1), 2 ); + ASM3( "addw", "value", "r", rnd ); + CONST( exp, convi->exp, 2 ); + ASM3( "shrsw", "value", "value", exp ); CONST( off, offset, 2 ); ASM3( "addw", "value", "value", off ); @@ -852,8 +852,7 @@ vips__image_intize( VipsImage *in, VipsImage **out ) static int vips_convi_intize( VipsConvi *convi, VipsImage *M ) { - int n_point = M->Xsize * M->Ysize; - + int n_point; VipsImage *t; double scale; double *scaled; @@ -862,6 +861,10 @@ vips_convi_intize( VipsConvi *convi, VipsImage *M ) int shift; int i; + n_point = M->Xsize * M->Ysize; + + g_assert( convi->n_point == n_point ); + if( vips_check_matrix( "vips2imask", M, &t ) ) return( -1 ); @@ -1003,7 +1006,7 @@ vips_convi_build( VipsObject *object ) in = convolution->in; M = convolution->M; - convi->n_point = n_point = M->Xsize * M->Ysize; + convi->n_point = M->Xsize * M->Ysize; if( vips_embed( in, &t[0], M->Xsize / 2, M->Ysize / 2, @@ -1042,12 +1045,15 @@ vips_convi_build( VipsObject *object ) convi->iM = M = t[1]; coeff = VIPS_MATRIX( M, 0, 0 ); + n_point = M->Xsize * M->Ysize; if( !(convi->coeff = VIPS_ARRAY( object, n_point, int )) || - !(convi->coeff_pos = VIPS_ARRAY( object, n_point, int )) ) + !(convi->coeff_pos = + VIPS_ARRAY( object, n_point, int )) ) return( -1 ); /* Squeeze out zero mask elements. */ + convi->nnz = 0; for( i = 0; i < n_point; i++ ) if( coeff[i] ) { convi->coeff[convi->nnz] = coeff[i]; @@ -1127,7 +1133,7 @@ vips_convi_init( VipsConvi *convi ) * The output image always has the same #VipsBandFormat as the input image. * * For #VIPS_FORMAT_UCHAR images, vips_convi() uses a fast vector path based on - * fixed-point arithmetic. This can produce slightly different results. + * half-float arithmetic. This can produce slightly different results. * Disable the vector path with `--vips-novector` or `VIPS_NOVECTOR` or * vips_vector_set_enabled(). *