From 4d10bd12f9466495acb45818c2d970d72f952533 Mon Sep 17 00:00:00 2001 From: John Cupitt Date: Thu, 10 Mar 2016 10:24:44 +0000 Subject: [PATCH] still trying to get reducevl3 to vectorise --- libvips/resample/presample.h | 8 ++- libvips/resample/reducehl3.cpp | 58 +++++++------------ libvips/resample/reducevl3.cpp | 101 +++++++++++++++++++++------------ libvips/resample/templates.h | 16 ++++++ 4 files changed, 107 insertions(+), 76 deletions(-) diff --git a/libvips/resample/presample.h b/libvips/resample/presample.h index a8f587f4..33f1e413 100644 --- a/libvips/resample/presample.h +++ b/libvips/resample/presample.h @@ -65,8 +65,12 @@ typedef struct _VipsResampleClass { GType vips_resample_get_type( void ); -int vips_reducehl3_get_points( VipsKernel kernel ); -void vips_reducehl3_make_mask( VipsKernel kernel, double x, double *c ); +/* The max size of the vector we use. + */ +#define MAX_POINTS (6) + +int vips_reduce_get_points( VipsKernel kernel ); +void vips_reduce_make_mask( VipsKernel kernel, double x, double *c ); #ifdef __cplusplus } diff --git a/libvips/resample/reducehl3.cpp b/libvips/resample/reducehl3.cpp index 3bd2f15e..55804c27 100644 --- a/libvips/resample/reducehl3.cpp +++ b/libvips/resample/reducehl3.cpp @@ -64,10 +64,6 @@ * 1D resampling kernels. */ -/* The max size of the vector we use. - */ -#define MAX_POINTS (6) - typedef struct _VipsReducehl3 { VipsResample parent_instance; @@ -101,7 +97,7 @@ G_DEFINE_TYPE( VipsReducehl3, vips_reducehl3, VIPS_TYPE_RESAMPLE ); /* Get n points. */ int -vips_reducehl3_get_points( VipsKernel kernel ) +vips_reduce_get_points( VipsKernel kernel ) { switch( kernel ) { case VIPS_KERNEL_NEAREST: @@ -128,7 +124,7 @@ vips_reducehl3_get_points( VipsKernel kernel ) /* Calculate a mask. */ void -vips_reducehl3_make_mask( VipsKernel kernel, double x, double *c ) +vips_reduce_make_mask( VipsKernel kernel, double x, double *c ) { switch( kernel ) { case VIPS_KERNEL_NEAREST: @@ -136,8 +132,8 @@ vips_reducehl3_make_mask( VipsKernel kernel, double x, double *c ) break; case VIPS_KERNEL_LINEAR: - c[0] = x; - c[1] = 1.0 - x; + c[0] = 1.0 - x; + c[1] = x; break; case VIPS_KERNEL_CUBIC: @@ -192,22 +188,6 @@ reducehl3_unsigned_uint8_4tab( VipsPel *out, const VipsPel *in, } } -/* Our inner loop. Operate on elements of size T, gather results in an - * intermediate of type IT. - */ -template -static IT -reducehl3_sum( const T * restrict in, int bands, const IT * restrict c, int n ) -{ - IT sum; - - sum = 0; - for( int i = 0; i < n; i++ ) - sum += c[i] * in[i * bands]; - - return( sum ); -} - template static void inline reducehl3_unsigned_int_tab( VipsReducehl3 *reducehl3, @@ -216,11 +196,12 @@ reducehl3_unsigned_int_tab( VipsReducehl3 *reducehl3, { T* restrict out = (T *) pout; const T* restrict in = (T *) pin; + const int n = reducehl3->n_points; for( int z = 0; z < bands; z++ ) { int sum; - sum = reducehl3_sum(in, bands, cx, reducehl3->n_points); + sum = reduce_sum( in, bands, cx, n ); sum = unsigned_fixed_round( sum ); sum = VIPS_CLIP( 0, sum, max_value ); @@ -238,11 +219,12 @@ reducehl3_signed_int_tab( VipsReducehl3 *reducehl3, { T* restrict out = (T *) pout; const T* restrict in = (T *) pin; + const int n = reducehl3->n_points; for( int z = 0; z < bands; z++ ) { int sum; - sum = reducehl3_sum(in, bands, cx, reducehl3->n_points); + sum = reduce_sum( in, bands, cx, n ); sum = signed_fixed_round( sum ); sum = VIPS_CLIP( min_value, sum, max_value ); @@ -262,10 +244,10 @@ reducehl3_float_tab( VipsReducehl3 *reducehl3, { T* restrict out = (T *) pout; const T* restrict in = (T *) pin; + const int n = reducehl3->n_points; for( int z = 0; z < bands; z++ ) { - out[z] = reducehl3_sum - (in, bands, cx, reducehl3->n_points); + out[z] = reduce_sum( in, bands, cx, n ); in += 1; } } @@ -281,12 +263,12 @@ reducehl3_unsigned_int32_tab( VipsReducehl3 *reducehl3, { T* restrict out = (T *) pout; const T* restrict in = (T *) pin; + const int n = reducehl3->n_points; for( int z = 0; z < bands; z++ ) { double sum; - sum = reducehl3_sum - (in, bands, cx, reducehl3->n_points); + sum = reduce_sum( in, bands, cx, n ); out[z] = VIPS_CLIP( 0, sum, max_value ); in += 1; @@ -301,12 +283,12 @@ reducehl3_signed_int32_tab( VipsReducehl3 *reducehl3, { T* restrict out = (T *) pout; const T* restrict in = (T *) pin; + const int n = reducehl3->n_points; for( int z = 0; z < bands; z++ ) { double sum; - sum = reducehl3_sum - (in, bands, cx, reducehl3->n_points); + sum = reduce_sum( in, bands, cx, n ); sum = VIPS_CLIP( min_value, sum, max_value ); out[z] = sum; @@ -324,14 +306,14 @@ reducehl3_notab( VipsReducehl3 *reducehl3, { T* restrict out = (T *) pout; const T* restrict in = (T *) pin; + const int n = reducehl3->n_points; double cx[MAX_POINTS]; - vips_reducehl3_make_mask( reducehl3->kernel, x, cx ); + vips_reduce_make_mask( reducehl3->kernel, x, cx ); for( int z = 0; z < bands; z++ ) { - out[z] = reducehl3_sum - (in, bands, cx, reducehl3->n_points); + out[z] = reduce_sum( in, bands, cx, n ); in += 1; } @@ -488,9 +470,9 @@ vips_reducehl3_build( VipsObject *object ) /* Build the tables of pre-computed coefficients. */ - reducehl3->n_points = vips_reducehl3_get_points( reducehl3->kernel ); + reducehl3->n_points = vips_reduce_get_points( reducehl3->kernel ); for( int x = 0; x < VIPS_TRANSFORM_SCALE + 1; x++ ) { - vips_reducehl3_make_mask( reducehl3->kernel, + vips_reduce_make_mask( reducehl3->kernel, (float) x / VIPS_TRANSFORM_SCALE, reducehl3->matrixf[x] ); @@ -575,7 +557,7 @@ vips_reducehl3_class_init( VipsReducehl3Class *reducehl3_class ) VIPS_ARG_ENUM( reducehl3_class, "kernel", 3, _( "Kernel" ), - _( "Resamling kernel" ), + _( "Resampling kernel" ), VIPS_ARGUMENT_OPTIONAL_INPUT, G_STRUCT_OFFSET( VipsReducehl3, kernel ), VIPS_TYPE_KERNEL, VIPS_KERNEL_CUBIC ); diff --git a/libvips/resample/reducevl3.cpp b/libvips/resample/reducevl3.cpp index d291167d..e70ee469 100644 --- a/libvips/resample/reducevl3.cpp +++ b/libvips/resample/reducevl3.cpp @@ -2,6 +2,8 @@ * * 29/1/16 * - from shrinkv.c + * 10/3/16 + * - add other kernels */ /* @@ -51,25 +53,34 @@ #include "presample.h" #include "templates.h" +/* The max size of the vector we use. + */ +#define MAX_POINTS (6) + typedef struct _VipsReducevl3 { VipsResample parent_instance; double yshrink; /* Shrink factor */ + /* The thing we use to make the kernel. + */ + VipsKernel kernel; + + /* Number of points in kernel. + */ + int n_points; + + /* Precalculated interpolation matrices. int (used for pel + * sizes up to short), and double (for all others). We go to + * scale + 1 so we can round-to-nearest safely. + */ + int matrixi[VIPS_TRANSFORM_SCALE + 1][MAX_POINTS]; + double matrixf[VIPS_TRANSFORM_SCALE + 1][MAX_POINTS]; + } VipsReducevl3; typedef VipsResampleClass VipsReducevl3Class; -/* Precalculated interpolation matrices. int (used for pel - * sizes up to short), and double (for all others). We go to - * scale + 1 so we can round-to-nearest safely. - */ - -const int n_points = 6; - -static int vips_reducevl3_matrixi[VIPS_TRANSFORM_SCALE + 1][n_points]; -static double vips_reducevl3_matrixf[VIPS_TRANSFORM_SCALE + 1][n_points]; - /* We need C linkage for this. */ extern "C" { @@ -78,29 +89,33 @@ G_DEFINE_TYPE( VipsReducevl3, vips_reducevl3, VIPS_TYPE_RESAMPLE ); template static void inline -reducevl3_unsigned_int_tab( VipsPel *pout, const VipsPel *pin, +reducevl3_unsigned_int_tab( VipsReducevl3 *reducevl3, + VipsPel *pout, const VipsPel *pin, const int ne, const int lskip, const int * restrict cy ) { T* restrict out = (T *) pout; const T* restrict in = (T *) pin; - + const int n = reducevl3->n_points; const int l1 = lskip / sizeof( T ); + const int round_by = VIPS_INTERPOLATE_SCALE >> 1; for( int z = 0; z < ne; z++ ) { int sum; - sum = 0; - for( int i = 0; i < n_points; i++ ) - sum += cy[i] * in[i * l1]; + sum = 0; + for( int i = 0; i < n; i++ ) + sum += cy[i] * in[z + i * l1]; - sum = unsigned_fixed_round( sum ); + sum = (sum + round_by) >> VIPS_INTERPOLATE_SHIFT; - sum = VIPS_CLIP( 0, sum, max_value ); + //sum = reduce_sum( in, l1, cy, n ); + //sum = unsigned_fixed_round( sum ); + //sum = VIPS_CLIP( 0, sum, max_value ); out[z] = sum; - in += 1; + //in += 1; } } @@ -129,7 +144,7 @@ vips_reducevl3_gen( VipsRegion *out_region, void *seq, s.left = r->left; s.top = r->top * reducevl3->yshrink; s.width = r->width; - s.height = r->height * reducevl3->yshrink + n_points; + s.height = r->height * reducevl3->yshrink + reducevl3->n_points; if( vips_region_prepare( ir, &s ) ) return( -1 ); @@ -142,14 +157,15 @@ vips_reducevl3_gen( VipsRegion *out_region, void *seq, const int sy = Y * VIPS_TRANSFORM_SCALE * 2; const int siy = sy & (VIPS_TRANSFORM_SCALE * 2 - 1); const int ty = (siy + 1) >> 1; - const int *cyi = vips_reducevl3_matrixi[ty]; - const double *cyf = vips_reducevl3_matrixf[ty]; + const int *cyi = reducevl3->matrixi[ty]; + const double *cyf = reducevl3->matrixf[ty]; const int lskip = VIPS_REGION_LSKIP( ir ); switch( in->BandFmt ) { case VIPS_FORMAT_UCHAR: reducevl3_unsigned_int_tab ( + reducevl3, q, p, ne, lskip, cyi ); break; @@ -191,6 +207,19 @@ vips_reducevl3_build( VipsObject *object ) if( reducevl3->yshrink == 1 ) return( vips_image_write( in, resample->out ) ); + /* Build the tables of pre-computed coefficients. + */ + reducevl3->n_points = vips_reduce_get_points( reducevl3->kernel ); + for( int y = 0; y < VIPS_TRANSFORM_SCALE + 1; y++ ) { + vips_reduce_make_mask( reducevl3->kernel, + (float) y / VIPS_TRANSFORM_SCALE, + reducevl3->matrixf[y] ); + + for( int i = 0; i < reducevl3->n_points; i++ ) + reducevl3->matrixi[y][i] = reducevl3->matrixf[y][i] * + VIPS_INTERPOLATE_SCALE; + } + /* Unpack for processing. */ if( vips_image_decode( in, &t[0] ) ) @@ -200,8 +229,8 @@ vips_reducevl3_build( VipsObject *object ) /* Add new pixels around the input so we can interpolate at the edges. */ if( vips_embed( in, &t[1], - 0, n_points / 2, - in->Xsize, in->Ysize + n_points - 1, + 0, reducevl3->n_points / 2, + in->Xsize, in->Ysize + reducevl3->n_points - 1, "extend", VIPS_EXTEND_COPY, NULL ) ) return( -1 ); @@ -217,7 +246,7 @@ vips_reducevl3_build( VipsObject *object ) * example, vipsthumbnail knows the true reduce factor (including the * fractional part), we just see the integer part here. */ - resample->out->Ysize = (in->Ysize - n_points + 1) / reducevl3->yshrink; + resample->out->Ysize = (in->Ysize - reducevl3->n_points + 1) / reducevl3->yshrink; if( resample->out->Ysize <= 0 ) { vips_error( object_class->nickname, "%s", _( "image has shrunk to nothing" ) ); @@ -264,24 +293,20 @@ vips_reducevl3_class_init( VipsReducevl3Class *reducevl3_class ) G_STRUCT_OFFSET( VipsReducevl3, yshrink ), 1, 1000000, 1 ); - /* Build the tables of pre-computed coefficients. - */ - for( int y = 0; y < VIPS_TRANSFORM_SCALE + 1; y++ ) { - calculate_coefficients_lanczos( 3, - (float) y / VIPS_TRANSFORM_SCALE, - vips_reducevl3_matrixf[y] ); + VIPS_ARG_ENUM( reducevl3_class, "kernel", 3, + _( "Kernel" ), + _( "Resampling kernel" ), + VIPS_ARGUMENT_OPTIONAL_INPUT, + G_STRUCT_OFFSET( VipsReducevl3, kernel ), + VIPS_TYPE_KERNEL, VIPS_KERNEL_CUBIC ); - for( int i = 0; i < n_points; i++ ) - vips_reducevl3_matrixi[y][i] = - vips_reducevl3_matrixf[y][i] * - VIPS_INTERPOLATE_SCALE; - } } static void vips_reducevl3_init( VipsReducevl3 *reducevl3 ) { + reducevl3->kernel = VIPS_KERNEL_CUBIC; } /** @@ -291,8 +316,12 @@ vips_reducevl3_init( VipsReducevl3 *reducevl3 ) * @yshrink: horizontal reduce * @...: %NULL-terminated list of optional named arguments * + * Optional arguments: + * + * @kernel: #VipsKernel to use to interpolate (default: cubic) + * * Reduce @in vertically by a float factor. The pixels in @out are - * interpolated with a 1D cubic mask. This operation will not work well for + * interpolated with a 1D mask. This operation will not work well for * a reduction of more than a factor of two. * * This is a very low-level operation: see vips_resize() for a more diff --git a/libvips/resample/templates.h b/libvips/resample/templates.h index e6087236..50fd6d51 100644 --- a/libvips/resample/templates.h +++ b/libvips/resample/templates.h @@ -339,3 +339,19 @@ calculate_coefficients_lanczos( int a, const double x, double *c ) c[i] = l; } } + +/* Our inner loop for resampling with a convolution. Operate on elements of + * size T, gather results in an intermediate of type IT. + */ +template +static IT +reduce_sum( const T * restrict in, int stride, const IT * restrict c, int n ) +{ + IT sum; + + sum = 0; + for( int i = 0; i < n; i++ ) + sum += c[i] * in[i * stride]; + + return( sum ); +}