switch to 3.5 fixed point (3 integer bits, 5 fractional bits) for vec convi

This lets us execute the mask

3 3 8 0
-1 -1 -1
-1 16 -1
-1 -1 -1

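on the vector unit: after dividing by the scale of 8, the centre coefficient is 16 / 8 = 2.0, which did not fit the old 2.6 format (range -2 to +1.99) but does fit 3.5 (range -4 to +3.99).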
This commit is contained in:
John Cupitt 2016-09-12 14:48:23 +01:00
parent 707235ae61
commit b96335365e
1 changed file with 17 additions and 24 deletions

View File

@ -120,11 +120,11 @@
#include "pconvolution.h" #include "pconvolution.h"
/* We do the 8-bit vector path with fixed-point arithmetic. We use 2.6 bits /* We do the 8-bit vector path with fixed-point arithmetic. We use 3.5 bits
* for the mask coefficients, so our range is -2 to +1.99, after using scale * for the mask coefficients, so our range is -4 to +3.99, after using scale
* on the mask. * on the mask.
*/ */
#define FIXED_BITS (6) #define FIXED_BITS (5)
#define FIXED_SCALE (1 << FIXED_BITS) #define FIXED_SCALE (1 << FIXED_BITS)
/* Larger than this and we fall back to C. /* Larger than this and we fall back to C.
@ -352,9 +352,7 @@ vips_convi_compile_section( VipsConvi *convi, VipsImage *in, Pass *pass )
ASM2( "convubw", "value", "valueb" ); ASM2( "convubw", "value", "valueb" );
} }
/* Mask coefficients are 2.6 bits fixed point, so -2 to +1.99. /* We need a signed multiply, so the image pixel needs to
*
* We need a signed multiply, so the image pixel needs to
* become a signed 16-bit value. We know only the bottom 8 bits * become a signed 16-bit value. We know only the bottom 8 bits
* of the image and coefficient are interesting, so we can take * of the image and coefficient are interesting, so we can take
* the bottom half of a 16x16->32 multiply. * the bottom half of a 16x16->32 multiply.
@ -400,8 +398,8 @@ vips_convi_compile_clip( VipsConvi *convi )
int offset = VIPS_RINT( vips_image_get_offset( M ) ); int offset = VIPS_RINT( vips_image_get_offset( M ) );
VipsVector *v; VipsVector *v;
char c32[256]; char c16[256];
char c6[256]; char c5[256];
char c0[256]; char c0[256];
char c255[256]; char c255[256];
char off[256]; char off[256];
@ -416,10 +414,10 @@ vips_convi_compile_clip( VipsConvi *convi )
*/ */
TEMP( "value", 2 ); TEMP( "value", 2 );
CONST( c32, 32, 2 ); CONST( c16, 16, 2 );
ASM3( "addw", "value", "r", c32 ); ASM3( "addw", "value", "r", c16 );
CONST( c6, 6, 2 ); CONST( c5, 5, 2 );
ASM3( "shrsw", "value", "value", c6 ); ASM3( "shrsw", "value", "value", c5 );
CONST( off, offset, 2 ); CONST( off, offset, 2 );
ASM3( "addw", "value", "value", off ); ASM3( "addw", "value", "value", off );
@ -851,19 +849,17 @@ intize_to_fixed_point( VipsImage *in, int *out )
scaled[i] = VIPS_MATRIX( t, 0, 0 )[i] / scale; scaled[i] = VIPS_MATRIX( t, 0, 0 )[i] / scale;
g_object_unref( t ); g_object_unref( t );
/* The scaled mask must fit in 2.6 bits, so we can handle -2 to +1.99 /* The scaled mask must fit in 3.5 bits, so we can handle -4 to +3.99
*/ */
for( i = 0; i < ne; i++ ) for( i = 0; i < ne; i++ )
if( scaled[i] >= 2.0 || if( scaled[i] >= 4.0 ||
scaled[i] < -2 ) { scaled[i] < -4 ) {
#ifdef DEBUG_COMPILE vips_info( "intize_to_fixed_point",
printf( "intize_to_fixed_point: out of range\n" ); "out of range for vector path" );
#endif /*DEBUG_COMPILE*/
return( -1 ); return( -1 );
} }
/* The smallest coefficient we can manage is 1/64th, we'll just turn /* The smallest coefficient we can manage is 1/32nd, we'll just turn
* that into zero. * that into zero.
* *
* Find the total error we'll get by rounding down to zero and bail if * Find the total error we'll get by rounding down to zero and bail if
@ -878,10 +874,7 @@ intize_to_fixed_point( VipsImage *in, int *out )
/* 0.1 is a 10% error. /* 0.1 is a 10% error.
*/ */
if( total_error > 0.1 ) { if( total_error > 0.1 ) {
#ifdef DEBUG_COMPILE vips_info( "intize_to_fixed_point", "too many underflows" );
printf( "intize_to_fixed_point: too many underflows\n" );
#endif /*DEBUG_COMPILE*/
return( -1 ); return( -1 );
} }