From 4d10bd12f9466495acb45818c2d970d72f952533 Mon Sep 17 00:00:00 2001
From: John Cupitt <jcupitt@gmail.com>
Date: Thu, 10 Mar 2016 10:24:44 +0000
Subject: [PATCH] still trying to get reducevl3 to vectorise

---
 libvips/resample/presample.h   |   8 ++-
 libvips/resample/reducehl3.cpp |  58 +++++++------------
 libvips/resample/reducevl3.cpp | 101 +++++++++++++++++++++------------
 libvips/resample/templates.h   |  16 ++++++
 4 files changed, 107 insertions(+), 76 deletions(-)
diff --git a/libvips/resample/presample.h b/libvips/resample/presample.h
index a8f587f4..33f1e413 100644
--- a/libvips/resample/presample.h
+++ b/libvips/resample/presample.h
@@ -65,8 +65,12 @@ typedef struct _VipsResampleClass {
 
 GType vips_resample_get_type( void );
 
-int vips_reducehl3_get_points( VipsKernel kernel );
-void vips_reducehl3_make_mask( VipsKernel kernel, double x, double *c );
+/* The max size of the vector we use.
+ */
+#define MAX_POINTS (6)
+
+int vips_reduce_get_points( VipsKernel kernel );
+void vips_reduce_make_mask( VipsKernel kernel, double x, double *c );
 
 #ifdef __cplusplus
 }
diff --git a/libvips/resample/reducehl3.cpp b/libvips/resample/reducehl3.cpp
index 3bd2f15e..55804c27 100644
--- a/libvips/resample/reducehl3.cpp
+++ b/libvips/resample/reducehl3.cpp
@@ -64,10 +64,6 @@
  * 1D resampling kernels. 
  */
 
-/* The max size of the vector we use.
- */
-#define MAX_POINTS (6)
-
 typedef struct _VipsReducehl3 {
 	VipsResample parent_instance;
 
@@ -101,7 +97,7 @@ G_DEFINE_TYPE( VipsReducehl3, vips_reducehl3, VIPS_TYPE_RESAMPLE );
 /* Get n points.
  */
 int
-vips_reducehl3_get_points( VipsKernel kernel ) 
+vips_reduce_get_points( VipsKernel kernel ) 
 {
 	switch( kernel ) {
 	case VIPS_KERNEL_NEAREST:
@@ -128,7 +124,7 @@ vips_reducehl3_get_points( VipsKernel kernel )
 /* Calculate a mask.
  */
 void
-vips_reducehl3_make_mask( VipsKernel kernel, double x, double *c )
+vips_reduce_make_mask( VipsKernel kernel, double x, double *c )
 {
 	switch( kernel ) {
 	case VIPS_KERNEL_NEAREST:
@@ -136,8 +132,8 @@ vips_reducehl3_make_mask( VipsKernel kernel, double x, double *c )
 		break;
 
 	case VIPS_KERNEL_LINEAR:
-		c[0] = x;
-		c[1] = 1.0 - x;
+		c[0] = 1.0 - x;
+		c[1] = x;
 		break;
 
 	case VIPS_KERNEL_CUBIC:
@@ -192,22 +188,6 @@ reducehl3_unsigned_uint8_4tab( VipsPel *out, const VipsPel *in,
 	}
 }
 
-/* Our inner loop. Operate on elements of size T, gather results in an
- * intermediate of type IT.
- */
-template <typename T, typename IT>
-static IT
-reducehl3_sum( const T * restrict in, int bands, const IT * restrict c, int n )
-{
-	IT sum;
-
-	sum = 0; 
-	for( int i = 0; i < n; i++ )
-		sum += c[i] * in[i * bands];
-
-	return( sum ); 
-}
-
 template <typename T, int max_value>
 static void inline
 reducehl3_unsigned_int_tab( VipsReducehl3 *reducehl3,
@@ -216,11 +196,12 @@ reducehl3_unsigned_int_tab( VipsReducehl3 *reducehl3,
 {
 	T* restrict out = (T *) pout;
 	const T* restrict in = (T *) pin;
+	const int n = reducehl3->n_points;
 
 	for( int z = 0; z < bands; z++ ) {
 		int sum;
 	       
-		sum = reducehl3_sum<T, int>(in, bands, cx, reducehl3->n_points);
+		sum = reduce_sum<T, int>( in, bands, cx, n );
 		sum = unsigned_fixed_round( sum ); 
 		sum = VIPS_CLIP( 0, sum, max_value ); 
 		
@@ -238,11 +219,12 @@ reducehl3_signed_int_tab( VipsReducehl3 *reducehl3,
 {
 	T* restrict out = (T *) pout;
 	const T* restrict in = (T *) pin;
+	const int n = reducehl3->n_points;
 
 	for( int z = 0; z < bands; z++ ) {
 		int sum;
 
-		sum = reducehl3_sum<T, int>(in, bands, cx, reducehl3->n_points);
+		sum = reduce_sum<T, int>( in, bands, cx, n );
 		sum = signed_fixed_round( sum ); 
 		sum = VIPS_CLIP( min_value, sum, max_value ); 
 
@@ -262,10 +244,10 @@ reducehl3_float_tab( VipsReducehl3 *reducehl3,
 {
 	T* restrict out = (T *) pout;
 	const T* restrict in = (T *) pin;
+	const int n = reducehl3->n_points;
 
 	for( int z = 0; z < bands; z++ ) {
-		out[z] = reducehl3_sum<T, double>
-			(in, bands, cx, reducehl3->n_points);
+		out[z] = reduce_sum<T, double>( in, bands, cx, n );
 		in += 1;
 	}
 }
@@ -281,12 +263,12 @@ reducehl3_unsigned_int32_tab( VipsReducehl3 *reducehl3,
 {
 	T* restrict out = (T *) pout;
 	const T* restrict in = (T *) pin;
+	const int n = reducehl3->n_points;
 
 	for( int z = 0; z < bands; z++ ) {
 		double sum;
 
-		sum = reducehl3_sum<T, double>
-			(in, bands, cx, reducehl3->n_points);
+		sum = reduce_sum<T, double>( in, bands, cx, n );
 		out[z] = VIPS_CLIP( 0, sum, max_value ); 
 
 		in += 1;
@@ -301,12 +283,12 @@ reducehl3_signed_int32_tab( VipsReducehl3 *reducehl3,
 {
 	T* restrict out = (T *) pout;
 	const T* restrict in = (T *) pin;
+	const int n = reducehl3->n_points;
 
 	for( int z = 0; z < bands; z++ ) {
 		double sum;
 
-		sum = reducehl3_sum<T, double>
-			(in, bands, cx, reducehl3->n_points);
+		sum = reduce_sum<T, double>( in, bands, cx, n );
 		sum = VIPS_CLIP( min_value, sum, max_value ); 
 		out[z] = sum;
 
@@ -324,14 +306,14 @@ reducehl3_notab( VipsReducehl3 *reducehl3,
 {
 	T* restrict out = (T *) pout;
 	const T* restrict in = (T *) pin;
+	const int n = reducehl3->n_points;
 
 	double cx[MAX_POINTS];
 
-	vips_reducehl3_make_mask( reducehl3->kernel, x, cx ); 
+	vips_reduce_make_mask( reducehl3->kernel, x, cx ); 
 
 	for( int z = 0; z < bands; z++ ) {
-		out[z] = reducehl3_sum<T, double>
-			(in, bands, cx, reducehl3->n_points);
+		out[z] = reduce_sum<T, double>( in, bands, cx, n );
 
 		in += 1;
 	}
@@ -488,9 +470,9 @@ vips_reducehl3_build( VipsObject *object )
 
 	/* Build the tables of pre-computed coefficients.
 	 */
-	reducehl3->n_points = vips_reducehl3_get_points( reducehl3->kernel ); 
+	reducehl3->n_points = vips_reduce_get_points( reducehl3->kernel ); 
 	for( int x = 0; x < VIPS_TRANSFORM_SCALE + 1; x++ ) {
-		vips_reducehl3_make_mask( reducehl3->kernel, 
+		vips_reduce_make_mask( reducehl3->kernel, 
 			(float) x / VIPS_TRANSFORM_SCALE,
 			reducehl3->matrixf[x] );
 
@@ -575,7 +557,7 @@ vips_reducehl3_class_init( VipsReducehl3Class *reducehl3_class )
 
 	VIPS_ARG_ENUM( reducehl3_class, "kernel", 3, 
 		_( "Kernel" ), 
-		_( "Resamling kernel" ),
+		_( "Resampling kernel" ),
 		VIPS_ARGUMENT_OPTIONAL_INPUT,
 		G_STRUCT_OFFSET( VipsReducehl3, kernel ),
 		VIPS_TYPE_KERNEL, VIPS_KERNEL_CUBIC );
diff --git a/libvips/resample/reducevl3.cpp b/libvips/resample/reducevl3.cpp
index d291167d..e70ee469 100644
--- a/libvips/resample/reducevl3.cpp
+++ b/libvips/resample/reducevl3.cpp
@@ -2,6 +2,8 @@
  *
  * 29/1/16
  * 	- from shrinkv.c
+ * 10/3/16
+ * 	- add other kernels
  */
 
 /*
@@ -51,25 +53,34 @@
 #include "presample.h"
 #include "templates.h"
 
+/* The max size of the vector we use.
+ */
+#define MAX_POINTS (6)
+
 typedef struct _VipsReducevl3 {
 	VipsResample parent_instance;
 
 	double yshrink;		/* Shrink factor */
 
+	/* The thing we use to make the kernel.
+	 */
+	VipsKernel kernel;
+
+	/* Number of points in kernel.
+	 */
+	int n_points;
+
+	/* Precalculated interpolation matrices. int (used for pel
+	 * sizes up to short), and double (for all others). We go to
+	 * scale + 1 so we can round-to-nearest safely.
+	 */
+	int matrixi[VIPS_TRANSFORM_SCALE + 1][MAX_POINTS];
+	double matrixf[VIPS_TRANSFORM_SCALE + 1][MAX_POINTS];
+
 } VipsReducevl3;
 
 typedef VipsResampleClass VipsReducevl3Class;
 
-/* Precalculated interpolation matrices. int (used for pel
- * sizes up to short), and double (for all others). We go to
- * scale + 1 so we can round-to-nearest safely.
- */
-
-const int n_points = 6;
-
-static int vips_reducevl3_matrixi[VIPS_TRANSFORM_SCALE + 1][n_points];
-static double vips_reducevl3_matrixf[VIPS_TRANSFORM_SCALE + 1][n_points];
-
 /* We need C linkage for this.
  */
 extern "C" {
@@ -78,29 +89,33 @@ G_DEFINE_TYPE( VipsReducevl3, vips_reducevl3, VIPS_TYPE_RESAMPLE );
 
 template <typename T, int max_value>
 static void inline
-reducevl3_unsigned_int_tab( VipsPel *pout, const VipsPel *pin,
+reducevl3_unsigned_int_tab( VipsReducevl3 *reducevl3,
+	VipsPel *pout, const VipsPel *pin,
 	const int ne, const int lskip,
 	const int * restrict cy )
 {
 	T* restrict out = (T *) pout;
 	const T* restrict in = (T *) pin;
-
+	const int n = reducevl3->n_points;
 	const int l1 = lskip / sizeof( T );
+	const int round_by = VIPS_INTERPOLATE_SCALE >> 1;
 
 	for( int z = 0; z < ne; z++ ) {
 		int sum;
 
-		sum = 0;
-		for( int i = 0; i < n_points; i++ )
-			sum += cy[i] * in[i * l1];
+		sum = 0; 
+		for( int i = 0; i < n; i++ )
+			sum += cy[i] * in[z + i * l1];
 
-		sum = unsigned_fixed_round( sum ); 
+		sum = (sum + round_by) >> VIPS_INTERPOLATE_SHIFT;
 
-		sum = VIPS_CLIP( 0, sum, max_value ); 
+		//sum = reduce_sum<T, int>( in, l1, cy, n );
+		//sum = unsigned_fixed_round( sum ); 
+		//sum = VIPS_CLIP( 0, sum, max_value ); 
 
 		out[z] = sum;
 
-		in += 1;
+		//in += 1;
 	}
 }
 
@@ -129,7 +144,7 @@ vips_reducevl3_gen( VipsRegion *out_region, void *seq,
 	s.left = r->left;
 	s.top = r->top * reducevl3->yshrink;
 	s.width = r->width;
-	s.height = r->height * reducevl3->yshrink + n_points;
+	s.height = r->height * reducevl3->yshrink + reducevl3->n_points;
 	if( vips_region_prepare( ir, &s ) )
 		return( -1 );
 
@@ -142,14 +157,15 @@ vips_reducevl3_gen( VipsRegion *out_region, void *seq,
 		const int sy = Y * VIPS_TRANSFORM_SCALE * 2;
 		const int siy = sy & (VIPS_TRANSFORM_SCALE * 2 - 1);
 		const int ty = (siy + 1) >> 1;
-		const int *cyi = vips_reducevl3_matrixi[ty];
-		const double *cyf = vips_reducevl3_matrixf[ty];
+		const int *cyi = reducevl3->matrixi[ty];
+		const double *cyf = reducevl3->matrixf[ty];
 		const int lskip = VIPS_REGION_LSKIP( ir );
 
 		switch( in->BandFmt ) {
 		case VIPS_FORMAT_UCHAR:
 			reducevl3_unsigned_int_tab
 				<unsigned char, UCHAR_MAX>(
+				reducevl3,
 				q, p, ne, lskip, cyi );
 			break;
 
@@ -191,6 +207,19 @@ vips_reducevl3_build( VipsObject *object )
 	if( reducevl3->yshrink == 1 ) 
 		return( vips_image_write( in, resample->out ) );
 
+	/* Build the tables of pre-computed coefficients.
+	 */
+	reducevl3->n_points = vips_reduce_get_points( reducevl3->kernel ); 
+	for( int y = 0; y < VIPS_TRANSFORM_SCALE + 1; y++ ) {
+		vips_reduce_make_mask( reducevl3->kernel, 
+			(float) y / VIPS_TRANSFORM_SCALE,
+			reducevl3->matrixf[y] );
+
+		for( int i = 0; i < reducevl3->n_points; i++ )
+			reducevl3->matrixi[y][i] = reducevl3->matrixf[y][i] * 
+				VIPS_INTERPOLATE_SCALE;
+	}
+
 	/* Unpack for processing.
 	 */
 	if( vips_image_decode( in, &t[0] ) )
@@ -200,8 +229,8 @@ vips_reducevl3_build( VipsObject *object )
 	/* Add new pixels around the input so we can interpolate at the edges.
 	 */
 	if( vips_embed( in, &t[1], 
-		0, n_points / 2, 
-		in->Xsize, in->Ysize + n_points - 1, 
+		0, reducevl3->n_points / 2, 
+		in->Xsize, in->Ysize + reducevl3->n_points - 1, 
 		"extend", VIPS_EXTEND_COPY,
 		NULL ) )
 		return( -1 );
@@ -217,7 +246,7 @@ vips_reducevl3_build( VipsObject *object )
 	 * example, vipsthumbnail knows the true reduce factor (including the
 	 * fractional part), we just see the integer part here.
 	 */
-	resample->out->Ysize = (in->Ysize - n_points + 1) / reducevl3->yshrink;
+	resample->out->Ysize = (in->Ysize - reducevl3->n_points + 1) / reducevl3->yshrink;
 	if( resample->out->Ysize <= 0 ) { 
 		vips_error( object_class->nickname, 
 			"%s", _( "image has shrunk to nothing" ) );
@@ -264,24 +293,20 @@ vips_reducevl3_class_init( VipsReducevl3Class *reducevl3_class )
 		G_STRUCT_OFFSET( VipsReducevl3, yshrink ),
 		1, 1000000, 1 );
 
-	/* Build the tables of pre-computed coefficients.
-	 */
-	for( int y = 0; y < VIPS_TRANSFORM_SCALE + 1; y++ ) {
-		calculate_coefficients_lanczos( 3, 
-			(float) y / VIPS_TRANSFORM_SCALE,
-			vips_reducevl3_matrixf[y] );
+	VIPS_ARG_ENUM( reducevl3_class, "kernel", 3, 
+		_( "Kernel" ), 
+		_( "Resampling kernel" ),
+		VIPS_ARGUMENT_OPTIONAL_INPUT,
+		G_STRUCT_OFFSET( VipsReducevl3, kernel ),
+		VIPS_TYPE_KERNEL, VIPS_KERNEL_CUBIC );
 
-		for( int i = 0; i < n_points; i++ )
-			vips_reducevl3_matrixi[y][i] =
-				vips_reducevl3_matrixf[y][i] * 
-				VIPS_INTERPOLATE_SCALE;
-	}
 
 }
 
 static void
 vips_reducevl3_init( VipsReducevl3 *reducevl3 )
 {
+	reducevl3->kernel = VIPS_KERNEL_CUBIC;
 }
 
 /**
@@ -291,8 +316,12 @@ vips_reducevl3_init( VipsReducevl3 *reducevl3 )
  * @yshrink: horizontal reduce
  * @...: %NULL-terminated list of optional named arguments
  *
+ * Optional arguments:
+ *
+ * @kernel: #VipsKernel to use to interpolate (default: cubic)
+ *
  * Reduce @in vertically by a float factor. The pixels in @out are
- * interpolated with a 1D cubic mask. This operation will not work well for
+ * interpolated with a 1D mask. This operation will not work well for
  * a reduction of more than a factor of two.
  *
  * This is a very low-level operation: see vips_resize() for a more
diff --git a/libvips/resample/templates.h b/libvips/resample/templates.h
index e6087236..50fd6d51 100644
--- a/libvips/resample/templates.h
+++ b/libvips/resample/templates.h
@@ -339,3 +339,19 @@ calculate_coefficients_lanczos( int a, const double x, double *c )
 		c[i] = l;
 	}
 }
+
+/* Our inner loop for resampling with a convolution. Operate on elements of 
+ * size T, gather results in an intermediate of type IT.
+ */
+template <typename T, typename IT>
+static IT
+reduce_sum( const T * restrict in, int stride, const IT * restrict c, int n )
+{
+	IT sum;
+
+	sum = 0; 
+	for( int i = 0; i < n; i++ )
+		sum += c[i] * in[i * stride];
+
+	return( sum ); 
+}