From 3e78bdb6a99060ae0c31fd60b66f6421253d66f6 Mon Sep 17 00:00:00 2001
From: John Cupitt <jcupitt@gmail.com>
Date: Wed, 27 Jan 2016 14:34:06 +0000
Subject: [PATCH] reduceh is working!

---
 libvips/include/vips/resample.h               |   4 +-
 libvips/resample/Makefile.am                  |   3 +-
 libvips/resample/{bicubich.cpp => cubich.cpp} | 120 ++++++-------
 libvips/resample/interpolate.c                |   4 +-
 libvips/resample/reduceh.c                    | 155 ++++------------
 libvips/resample/templates.h                  | 168 +++++++-----------
 6 files changed, 164 insertions(+), 290 deletions(-)
 rename libvips/resample/{bicubich.cpp => cubich.cpp} (67%)

diff --git a/libvips/include/vips/resample.h b/libvips/include/vips/resample.h
index fb409841..749f4fea 100644
--- a/libvips/include/vips/resample.h
+++ b/libvips/include/vips/resample.h
@@ -46,8 +46,8 @@ int vips_shrinkv( VipsImage *in, VipsImage **out, int yshrink, ... );
 
 int vips_reduce( VipsImage *in, VipsImage **out, 
 	double xshrink, double yshrink, ... );
-int vips_reduceh( VipsImage *in, VipsImage **out, int xshrink, ... );
-int vips_reducev( VipsImage *in, VipsImage **out, int yshrink, ... );
+int vips_reduceh( VipsImage *in, VipsImage **out, double xshrink, ... );
+int vips_reducev( VipsImage *in, VipsImage **out, double yshrink, ... );
 
 int vips_similarity( VipsImage *in, VipsImage **out, ... )
 	__attribute__((sentinel));
diff --git a/libvips/resample/Makefile.am b/libvips/resample/Makefile.am
index 81e14990..e0d46487 100644
--- a/libvips/resample/Makefile.am
+++ b/libvips/resample/Makefile.am
@@ -9,10 +9,11 @@ libresample_la_SOURCES = \
 	shrink.c \
 	shrinkh.c \
 	shrinkv.c \
+	reduceh.c \
 	interpolate.c \
 	transform.c \
 	bicubic.cpp \
-	bicubich.cpp \
+	cubich.cpp \
 	lbb.cpp \
 	nohalo.cpp \
 	vsqbs.cpp \
diff --git a/libvips/resample/bicubich.cpp b/libvips/resample/cubich.cpp
similarity index 67%
rename from libvips/resample/bicubich.cpp
rename to libvips/resample/cubich.cpp
index 937ad1d0..4ca38b2a 100644
--- a/libvips/resample/bicubich.cpp
+++ b/libvips/resample/cubich.cpp
@@ -1,7 +1,7 @@
-/* 1D horizontal bicubich (catmull-rom) interpolator
+/* horizontal cubic (catmull-rom) interpolator
  *
  * 26/1/16
- * 	- from bicubich.cpp
+ * 	- from bicubic.cpp
  */
 
 /*
@@ -31,7 +31,7 @@
 
  */
 
-/* Bicubich (Catmull-Rom) interpolator derived from Nicolas Robidoux's
+/* Cubic (Catmull-Rom) interpolator derived from Nicolas Robidoux's
  * original YAFR resampler with permission and thanks.
  */
 
@@ -56,38 +56,38 @@
 #include <dmalloc.h>
 #endif /*WITH_DMALLOC*/
 
-#define VIPS_TYPE_INTERPOLATE_BICUBICH \
-	(vips_interpolate_bicubich_get_type())
-#define VIPS_INTERPOLATE_BICUBICH( obj ) \
+#define VIPS_TYPE_INTERPOLATE_CUBICH \
+	(vips_interpolate_cubich_get_type())
+#define VIPS_INTERPOLATE_CUBICH( obj ) \
 	(G_TYPE_CHECK_INSTANCE_CAST( (obj), \
-	VIPS_TYPE_INTERPOLATE_BICUBICH, VipsInterpolateBicubich ))
-#define VIPS_INTERPOLATE_BICUBICH_CLASS( klass ) \
+	VIPS_TYPE_INTERPOLATE_CUBICH, VipsInterpolateCubich ))
+#define VIPS_INTERPOLATE_CUBICH_CLASS( klass ) \
 	(G_TYPE_CHECK_CLASS_CAST( (klass), \
-	VIPS_TYPE_INTERPOLATE_BICUBICH, VipsInterpolateBicubichClass))
-#define VIPS_IS_INTERPOLATE_BICUBICH( obj ) \
-	(G_TYPE_CHECK_INSTANCE_TYPE( (obj), VIPS_TYPE_INTERPOLATE_BICUBICH ))
-#define VIPS_IS_INTERPOLATE_BICUBICH_CLASS( klass ) \
-	(G_TYPE_CHECK_CLASS_TYPE( (klass), VIPS_TYPE_INTERPOLATE_BICUBICH ))
-#define VIPS_INTERPOLATE_BICUBICH_GET_CLASS( obj ) \
+	VIPS_TYPE_INTERPOLATE_CUBICH, VipsInterpolateCubichClass))
+#define VIPS_IS_INTERPOLATE_CUBICH( obj ) \
+	(G_TYPE_CHECK_INSTANCE_TYPE( (obj), VIPS_TYPE_INTERPOLATE_CUBICH ))
+#define VIPS_IS_INTERPOLATE_CUBICH_CLASS( klass ) \
+	(G_TYPE_CHECK_CLASS_TYPE( (klass), VIPS_TYPE_INTERPOLATE_CUBICH ))
+#define VIPS_INTERPOLATE_CUBICH_GET_CLASS( obj ) \
 	(G_TYPE_INSTANCE_GET_CLASS( (obj), \
-	VIPS_TYPE_INTERPOLATE_BICUBICH, VipsInterpolateBicubichClass ))
+	VIPS_TYPE_INTERPOLATE_CUBICH, VipsInterpolateCubichClass ))
 
-typedef VipsInterpolate VipsInterpolateBicubich;
+typedef VipsInterpolate VipsInterpolateCubich;
 
-typedef VipsInterpolateClass VipsInterpolateBicubichClass;
+typedef VipsInterpolateClass VipsInterpolateCubichClass;
 
 /* Precalculated interpolation matrices. int (used for pel
  * sizes up to short), and double (for all others). We go to
  * scale + 1 so we can round-to-nearest safely.
  */
 
-static int vips_bicubich_matrixi[VIPS_TRANSFORM_SCALE + 1][4];
-static double vips_bicubich_matrixf[VIPS_TRANSFORM_SCALE + 1][4];
+static int vips_cubich_matrixi[VIPS_TRANSFORM_SCALE + 1][4];
+static double vips_cubich_matrixf[VIPS_TRANSFORM_SCALE + 1][4];
 
 /* We need C linkage for this.
  */
 extern "C" {
-G_DEFINE_TYPE( VipsInterpolateBicubich, vips_interpolate_bicubich,
+G_DEFINE_TYPE( VipsInterpolateCubich, vips_interpolate_cubich,
 	VIPS_TYPE_INTERPOLATE );
 }
 
@@ -103,7 +103,7 @@ G_DEFINE_TYPE( VipsInterpolateBicubich, vips_interpolate_bicubich,
 
 template <typename T, int max_value>
 static void inline
-bicubich_unsigned_int_tab( void *pout, const VipsPel *pin,
+cubich_unsigned_int_tab( void *pout, const VipsPel *pin,
 	const int bands, const int *cx )
 {
 	T* restrict out = (T *) pout;
@@ -119,12 +119,12 @@ bicubich_unsigned_int_tab( void *pout, const VipsPel *pin,
 		const T thr = in[b2];
 		const T fou = in[b3];
 
-		int bicubich = bicubic1d_unsigned_int<T>(
+		int cubich = cubic_unsigned_int<T>(
 			one, two, thr, fou, cx );
 
-		bicubich = VIPS_CLIP( 0, bicubich, max_value ); 
+		cubich = VIPS_CLIP( 0, cubich, max_value ); 
 
-		out[z] = bicubich;
+		out[z] = cubich;
 
 		in += 1;
 	}
@@ -132,7 +132,7 @@ bicubich_unsigned_int_tab( void *pout, const VipsPel *pin,
 
 template <typename T, int min_value, int max_value>
 static void inline
-bicubich_signed_int_tab( void *pout, const VipsPel *pin,
+cubich_signed_int_tab( void *pout, const VipsPel *pin,
 	const int bands, const int *cx )
 {
 	T* restrict out = (T *) pout;
@@ -148,12 +148,12 @@ bicubich_signed_int_tab( void *pout, const VipsPel *pin,
 		const T thr = in[b2];
 		const T fou = in[b3];
 
-		int bicubich = bicubic1d_signed_int<T>(
+		int cubich = cubic_signed_int<T>(
 			one, two, thr, fou, cx );
 
-		bicubich = VIPS_CLIP( min_value, bicubich, max_value ); 
+		cubich = VIPS_CLIP( min_value, cubich, max_value ); 
 
-		out[z] = bicubich;
+		out[z] = cubich;
 
 		in += 1;
 	}
@@ -163,7 +163,7 @@ bicubich_signed_int_tab( void *pout, const VipsPel *pin,
  */
 template <typename T>
 static void inline
-bicubich_float_tab( void *pout, const VipsPel *pin,
+cubich_float_tab( void *pout, const VipsPel *pin,
 	const int bands, const double *cx )
 {
 	T* restrict out = (T *) pout;
@@ -179,10 +179,10 @@ bicubich_float_tab( void *pout, const VipsPel *pin,
 		const T thr = in[b2];
 		const T fou = in[b3];
 
-		const T bicubich = bicubic1d_float<T>(
+		const T cubich = cubic_float<T>(
 			one, two, thr, fou, cx );
 
-		out[z] = bicubich;
+		out[z] = cubich;
 
 		in += 1;
 	}
@@ -192,7 +192,7 @@ bicubich_float_tab( void *pout, const VipsPel *pin,
  */
 template <typename T>
 static void inline
-bicubich_notab( void *pout, const VipsPel *pin,
+cubich_notab( void *pout, const VipsPel *pin,
 	const int bands, double x )
 {
 	T* restrict out = (T *) pout;
@@ -212,17 +212,17 @@ bicubich_notab( void *pout, const VipsPel *pin,
 		const T thr = in[b2];
 		const T fou = in[b3];
 
-		const T bicubich = bicubic1d_float<T>(
+		const T cubich = cubic_float<T>(
 			one, two, thr, fou, cx );
 
-		out[z] = bicubich;
+		out[z] = cubich;
 
 		in += 1;
 	}
 }
 
 static void
-vips_interpolate_bicubich_interpolate( VipsInterpolate *interpolate,
+vips_interpolate_cubich_interpolate( VipsInterpolate *interpolate,
 	void *out, VipsRegion *in, double x, double y )
 {
 	/* Find the mask index. We round-to-nearest, so we need to generate 
@@ -247,8 +247,8 @@ vips_interpolate_bicubich_interpolate( VipsInterpolate *interpolate,
 
 	/* Look up the tables we need.
 	 */
-	const int *cxi = vips_bicubich_matrixi[tx];
-	const double *cxf = vips_bicubich_matrixf[tx];
+	const int *cxi = vips_cubich_matrixi[tx];
+	const double *cxf = vips_cubich_matrixf[tx];
 
 	/* Pel size and line size.
 	 */
@@ -265,7 +265,7 @@ vips_interpolate_bicubich_interpolate( VipsInterpolate *interpolate,
 	g_assert( x >= 1.0 );
 
 #ifdef DEBUG
-	printf( "vips_interpolate_bicubich_interpolate: %g %g\n", x, y );
+	printf( "vips_interpolate_cubich_interpolate: %g %g\n", x, y );
 	printf( "\tleft=%d, top=%d, width=%d, height=%d\n",
 		ix - 1, iy, 4, 1 );
 	printf( "\tmaskx=%d\n", tx );
@@ -273,17 +273,17 @@ vips_interpolate_bicubich_interpolate( VipsInterpolate *interpolate,
 
 	switch( in->im->BandFmt ) {
 	case VIPS_FORMAT_UCHAR:
-		bicubich_unsigned_int_tab<unsigned char, UCHAR_MAX>( 
+		cubich_unsigned_int_tab<unsigned char, UCHAR_MAX>( 
 			out, p, bands, cxi );
 
 	/*
 
 	   Handy for benchmarking
 
-		bicubich_float_tab<unsigned char>(
+		cubich_float_tab<unsigned char>(
 			out, p, bands, cxf );
 
-		bicubich_notab<unsigned char>(
+		cubich_notab<unsigned char>(
 			out, p, bands, x - ix );
 
 	 */
@@ -291,47 +291,47 @@ vips_interpolate_bicubich_interpolate( VipsInterpolate *interpolate,
 		break;
 
 	case VIPS_FORMAT_CHAR:
-		bicubich_signed_int_tab<signed char, SCHAR_MIN, SCHAR_MAX>(
+		cubich_signed_int_tab<signed char, SCHAR_MIN, SCHAR_MAX>(
 			out, p, bands, cxi );
 		break;
 
 	case VIPS_FORMAT_USHORT:
-		bicubich_unsigned_int_tab<unsigned short, USHRT_MAX>(
+		cubich_unsigned_int_tab<unsigned short, USHRT_MAX>(
 			out, p, bands, cxi );
 		break;
 
 	case VIPS_FORMAT_SHORT:
-		bicubich_signed_int_tab<signed short, SHRT_MIN, SHRT_MAX>(
+		cubich_signed_int_tab<signed short, SHRT_MIN, SHRT_MAX>(
 			out, p, bands, cxi );
 		break;
 
 	case VIPS_FORMAT_UINT:
-		bicubich_float_tab<unsigned int>( 
+		cubich_float_tab<unsigned int>( 
 			out, p, bands, cxf );
 		break;
 
 	case VIPS_FORMAT_INT:
-		bicubich_float_tab<signed int>( 
+		cubich_float_tab<signed int>( 
 			out, p, bands, cxf );
 		break;
 
 	case VIPS_FORMAT_FLOAT:
-		bicubich_float_tab<float>( 
+		cubich_float_tab<float>( 
 			out, p, bands, cxf );
 		break;
 
 	case VIPS_FORMAT_DOUBLE:
-		bicubich_notab<double>( 
+		cubich_notab<double>( 
 			out, p, bands, x - ix );
 		break;
 
 	case VIPS_FORMAT_COMPLEX:
-		bicubich_float_tab<float>( 
+		cubich_float_tab<float>( 
 			out, p, bands * 2, cxf );
 		break;
 
 	case VIPS_FORMAT_DPCOMPLEX:
-		bicubich_notab<double>( 
+		cubich_notab<double>( 
 			out, p, bands * 2, x - ix );
 		break;
 
@@ -341,17 +341,17 @@ vips_interpolate_bicubich_interpolate( VipsInterpolate *interpolate,
 }
 
 static void
-vips_interpolate_bicubich_class_init( VipsInterpolateBicubichClass *iclass )
+vips_interpolate_cubich_class_init( VipsInterpolateCubichClass *iclass )
 {
 	VipsObjectClass *object_class = VIPS_OBJECT_CLASS( iclass );
 	VipsInterpolateClass *interpolate_class =
 		VIPS_INTERPOLATE_CLASS( iclass );
 
-	object_class->nickname = "bicubich";
+	object_class->nickname = "cubich";
 	object_class->description = 
-		_( "horizontal bicubic interpolation (Catmull-Rom)" );
+		_( "horizontal cubic interpolation (Catmull-Rom)" );
 
-	interpolate_class->interpolate = vips_interpolate_bicubich_interpolate;
+	interpolate_class->interpolate = vips_interpolate_cubich_interpolate;
 	interpolate_class->window_size = 4;
 
 	/* Build the tables of pre-computed coefficients.
@@ -359,21 +359,21 @@ vips_interpolate_bicubich_class_init( VipsInterpolateBicubichClass *iclass )
 	for( int x = 0; x < VIPS_TRANSFORM_SCALE + 1; x++ ) {
 		calculate_coefficients_catmull(
 			(float) x / VIPS_TRANSFORM_SCALE,
-			vips_bicubich_matrixf[x] );
+			vips_cubich_matrixf[x] );
 
 		for( int i = 0; i < 4; i++ )
-			vips_bicubich_matrixi[x][i] =
-				vips_bicubich_matrixf[x][i] * 
+			vips_cubich_matrixi[x][i] =
+				vips_cubich_matrixf[x][i] * 
 				VIPS_INTERPOLATE_SCALE;
 	}
 }
 
 static void
-vips_interpolate_bicubich_init( VipsInterpolateBicubich *bicubich )
+vips_interpolate_cubich_init( VipsInterpolateCubich *cubich )
 {
 #ifdef DEBUG
-	printf( "vips_interpolate_bicubich_init: " );
-	vips_object_print( VIPS_OBJECT( bicubich ) );
+	printf( "vips_interpolate_cubich_init: " );
+	vips_object_print( VIPS_OBJECT( cubich ) );
 #endif /*DEBUG*/
 
 }
diff --git a/libvips/resample/interpolate.c b/libvips/resample/interpolate.c
index 78124f6a..bccb0fe8 100644
--- a/libvips/resample/interpolate.c
+++ b/libvips/resample/interpolate.c
@@ -600,7 +600,7 @@ void
 vips__interpolate_init( void )
 {
 	extern GType vips_interpolate_bicubic_get_type( void );
-	extern GType vips_interpolate_bicubich_get_type( void );
+	extern GType vips_interpolate_cubich_get_type( void );
 	extern GType vips_interpolate_lbb_get_type( void );
 	extern GType vips_interpolate_nohalo_get_type( void );
 	extern GType vips_interpolate_vsqbs_get_type( void );
@@ -609,7 +609,7 @@ vips__interpolate_init( void )
 	vips_interpolate_bilinear_get_type();
 
 	vips_interpolate_bicubic_get_type();
-	vips_interpolate_bicubich_get_type();
+	vips_interpolate_cubich_get_type();
 	vips_interpolate_lbb_get_type();
 	vips_interpolate_nohalo_get_type();
 	vips_interpolate_vsqbs_get_type();
diff --git a/libvips/resample/reduceh.c b/libvips/resample/reduceh.c
index aee18902..66c7c9bd 100644
--- a/libvips/resample/reduceh.c
+++ b/libvips/resample/reduceh.c
@@ -62,136 +62,57 @@ typedef VipsResampleClass VipsReducehClass;
 
 G_DEFINE_TYPE( VipsReduceh, vips_reduceh, VIPS_TYPE_RESAMPLE );
 
-#define INNER( BANDS ) \
-	sum += p[x1]; \
-	x1 += BANDS; 
-
-/* Integer reduce. 
- */
-#define IREDUCE( TYPE, BANDS ) { \
-	TYPE * restrict p = (TYPE *) in; \
-	TYPE * restrict q = (TYPE *) out; \
-	\
-	for( x = 0; x < width; x++ ) { \
-		for( b = 0; b < BANDS; b++ ) { \
-			int sum; \
-			\
-			sum = 0; \
-			x1 = b; \
-			VIPS_UNROLL( reduce->xreduce, INNER( BANDS ) ); \
-			q[b] = (sum + reduce->xreduce / 2) / \
-				reduce->xreduce; \
-		} \
-		p += ne; \
-		q += BANDS; \
-	} \
-}
-
-/* Float reduce. 
- */
-#define FREDUCE( TYPE ) { \
-	TYPE * restrict p = (TYPE *) in; \
-	TYPE * restrict q = (TYPE *) out; \
-	\
-	for( x = 0; x < width; x++ ) { \
-		for( b = 0; b < bands; b++ ) { \
-			double sum; \
-			\
-			sum = 0.0; \
-			x1 = b; \
-			VIPS_UNROLL( reduce->xreduce, INNER( bands ) ); \
-			q[b] = sum / reduce->xreduce; \
-		} \
-		p += ne; \
-		q += bands; \
-	} \
-} 
-
-/* Generate an line of @or. @ir is large enough.
- */
-static void
-vips_reduceh_gen2( VipsReduceh *reduce, VipsRegion *or, VipsRegion *ir,
-	int left, int top, int width )
-{
-	VipsResample *resample = VIPS_RESAMPLE( reduce );
-	const int bands = resample->in->Bands * 
-		(vips_band_format_iscomplex( resample->in->BandFmt ) ? 
-		 	2 : 1);
-	const int ne = reduce->xreduce * bands; 
-	VipsPel *out = VIPS_REGION_ADDR( or, left, top ); 
-	VipsPel *in = VIPS_REGION_ADDR( ir, left * reduce->xreduce, top ); 
-
-	int x;
-	int x1, b;
-
-	switch( resample->in->BandFmt ) {
-		IREDUCE( unsigned char, bands ); break;
-	case VIPS_FORMAT_CHAR: 	
-		IREDUCE( char, bands ); break; 
-	case VIPS_FORMAT_USHORT: 
-		IREDUCE( unsigned short, bands ); break;
-	case VIPS_FORMAT_SHORT: 	
-		IREDUCE( short, bands ); break; 
-	case VIPS_FORMAT_UINT: 	
-		IREDUCE( unsigned int, bands ); break; 
-	case VIPS_FORMAT_INT: 	
-		IREDUCE( int, bands );  break; 
-	case VIPS_FORMAT_FLOAT: 	
-		FREDUCE( float ); break; 
-	case VIPS_FORMAT_DOUBLE:	
-		FREDUCE( double ); break;
-	case VIPS_FORMAT_COMPLEX: 	
-		FREDUCE( float ); break; 
-	case VIPS_FORMAT_DPCOMPLEX:	
-		FREDUCE( double ); break;
-
-	default:
-		g_assert_not_reached(); 
-	}
-}
-
 static int
 vips_reduceh_gen( VipsRegion *or, void *seq, 
 	void *a, void *b, gboolean *stop )
 {
-	VipsReduceh *reduce = (VipsReduceh *) b;
+	VipsImage *in = (VipsImage *) a;
+	VipsReduceh *reduceh = (VipsReduceh *) b;
+	int window_size = 
+		vips_interpolate_get_window_size( reduceh->interpolate );
+	int window_offset = 
+		vips_interpolate_get_window_offset( reduceh->interpolate );
+	const VipsInterpolateMethod interpolate = 
+		vips_interpolate_get_method( reduceh->interpolate );
+	int ps = VIPS_IMAGE_SIZEOF_PEL( in );
 	VipsRegion *ir = (VipsRegion *) seq;
 	VipsRect *r = &or->valid;
 
+	VipsRect s;
 	int y;
 
-	/* How do we chunk up the image? We don't want to prepare the whole of
-	 * the input region corresponding to *r since it could be huge. 
-	 *
-	 * Request input a line at a time. 
-	 */
-
 #ifdef DEBUG
 	printf( "vips_reduceh_gen: generating %d x %d at %d x %d\n",
 		r->width, r->height, r->left, r->top ); 
 #endif /*DEBUG*/
 
+	s.left = r->left * reduceh->xreduce;
+	s.top = r->top;
+	s.width = r->width * reduceh->xreduce + window_size;
+	s.height = r->height;
+	if( vips_region_prepare( ir, &s ) )
+		return( -1 );
+
+	VIPS_GATE_START( "vips_reduceh_gen: work" ); 
+
 	for( y = 0; y < r->height; y ++ ) { 
-		VipsRect s;
+		VipsPel *q = VIPS_REGION_ADDR( or, r->left, r->top + y ); 
+		double Y = r->top + y; 
 
-		s.left = r->left * reduce->xreduce;
-		s.top = r->top + y;
-		s.width = r->width * reduce->xreduce;
-		s.height = 1;
-#ifdef DEBUG
-		printf( "reduceh_gen: requesting line %d\n", s.top ); 
-#endif /*DEBUG*/
-		if( vips_region_prepare( ir, &s ) )
-			return( -1 );
+		int x;
 
-		VIPS_GATE_START( "vips_reduceh_gen: work" ); 
+		for( x = 0; x < r->width; x++ ) { 
+			double X = window_offset + 
+				(r->left + x) * reduceh->xreduce; 
 
-		vips_reduceh_gen2( reduce, or, ir, 
-			r->left, r->top + y, r->width );
+			interpolate( reduceh->interpolate, q, ir, X, Y );
 
-		VIPS_GATE_STOP( "vips_reduceh_gen: work" ); 
+			q += ps;
+		}
 	}
 
+	VIPS_GATE_STOP( "vips_reduceh_gen: work" ); 
+
 	return( 0 );
 }
 
@@ -200,7 +121,7 @@ vips_reduceh_build( VipsObject *object )
 {
 	VipsObjectClass *class = VIPS_OBJECT_GET_CLASS( object );
 	VipsResample *resample = VIPS_RESAMPLE( object );
-	VipsReduceh *reduce = (VipsReduceh *) object;
+	VipsReduceh *reduceh = (VipsReduceh *) object;
 	VipsImage **t = (VipsImage **) 
 		vips_object_local_array( object, 1 );
 
@@ -236,16 +157,16 @@ vips_reduceh_build( VipsObject *object )
 	window_offset = 
 		vips_interpolate_get_window_offset( reduceh->interpolate );
 
-	if( reduce->xreduce < 1 ) { 
+	if( reduceh->xreduce < 1 ) { 
 		vips_error( class->nickname, 
 			"%s", _( "reduce factors should be >= 1" ) );
 		return( -1 );
 	}
-	if( reduce->xreduce > 2 )  
+	if( reduceh->xreduce > 2 )  
 		vips_warn( class->nickname, 
 			"%s", _( "reduce factor greater than 2" ) );
 
-	if( reduce->xreduce == 1 ) 
+	if( reduceh->xreduce == 1 ) 
 		return( vips_image_write( in, resample->out ) );
 
 	/* Unpack for processing.
@@ -278,7 +199,7 @@ vips_reduceh_build( VipsObject *object )
 	 * example, vipsthumbnail knows the true reduce factor (including the
 	 * fractional part), we just see the integer part here.
 	 */
-	resample->out->Xsize = in->Xsize / reduce->xreduce;
+	resample->out->Xsize = (in->Xsize - window_size + 1) / reduceh->xreduce;
 	if( resample->out->Xsize <= 0 ) { 
 		vips_error( class->nickname, 
 			"%s", _( "image has shrunk to nothing" ) );
@@ -293,7 +214,7 @@ vips_reduceh_build( VipsObject *object )
 
 	if( vips_image_generate( resample->out,
 		vips_start_one, vips_reduceh_gen, vips_stop_one, 
-		in, reduce ) )
+		in, reduceh ) )
 		return( -1 );
 
 	return( 0 );
@@ -333,7 +254,7 @@ vips_reduceh_class_init( VipsReducehClass *class )
 }
 
 static void
-vips_reduceh_init( VipsReduceh *reduce )
+vips_reduceh_init( VipsReduceh *reduceh )
 {
 }
 
@@ -345,7 +266,7 @@ vips_reduceh_init( VipsReduceh *reduce )
  * @...: %NULL-terminated list of optional named arguments
  *
  * Reduce @in horizontally by a float factor. The pixels in @out are
- * interpolated with a 1D bicubic mask. This operation will not work well for
+ * interpolated with a 1D cubic mask. This operation will not work well for
  * a reduction of more than a factor of two.
  *
  * This is a very low-level operation: see vips_resize() for a more
diff --git a/libvips/resample/templates.h b/libvips/resample/templates.h
index b5437721..efd160ba 100644
--- a/libvips/resample/templates.h
+++ b/libvips/resample/templates.h
@@ -154,49 +154,8 @@ unsigned_fixed_round( int v )
 	return( (v + round_by) >> VIPS_INTERPOLATE_SHIFT );
 }
 
-/* Fixed-point integer bicubic, used for 8 and 16-bit types.
- */
 template <typename T> static int inline
-bicubic_unsigned_int(
-	const T uno_one, const T uno_two, const T uno_thr, const T uno_fou,
-	const T dos_one, const T dos_two, const T dos_thr, const T dos_fou,
-	const T tre_one, const T tre_two, const T tre_thr, const T tre_fou,
-	const T qua_one, const T qua_two, const T qua_thr, const T qua_fou,
-	const int* restrict cx, const int* restrict cy )
-{
-	const int r0 = unsigned_fixed_round( 
-		cx[0] * uno_one +
-		cx[1] * uno_two +
-		cx[2] * uno_thr +
-		cx[3] * uno_fou ); 
-
-	const int r1 = unsigned_fixed_round( 
-		cx[0] * dos_one +
-		cx[1] * dos_two +
-		cx[2] * dos_thr +
-		cx[3] * dos_fou );
-
-	const int r2 = unsigned_fixed_round( 
-		cx[0] * tre_one +
-		cx[1] * tre_two +
-		cx[2] * tre_thr +
-		cx[3] * tre_fou );
-
-	const int r3 = unsigned_fixed_round( 
-		cx[0] * qua_one +
-		cx[1] * qua_two +
-		cx[2] * qua_thr +
-		cx[3] * qua_fou );
-
-	return( unsigned_fixed_round( 
-		cy[0] * r0 +
-		cy[1] * r1 +
-		cy[2] * r2 +
-		cy[3] * r3 ) );
-}
-
-template <typename T> static int inline
-bicubic1d_unsigned_int(
+cubic_unsigned_int(
 	const T one, const T two, const T thr, const T fou,
 	const int* restrict cx )
 {
@@ -207,6 +166,28 @@ bicubic1d_unsigned_int(
 		cx[3] * fou ) ); 
 }
 
+/* Fixed-point integer bicubic, used for 8 and 16-bit types. The vertical
+ * pass runs at int so row sums outside the range of T are not wrapped. */
+template <typename T> static int inline
+bicubic_unsigned_int(
+	const T uno_one, const T uno_two, const T uno_thr, const T uno_fou,
+	const T dos_one, const T dos_two, const T dos_thr, const T dos_fou,
+	const T tre_one, const T tre_two, const T tre_thr, const T tre_fou,
+	const T qua_one, const T qua_two, const T qua_thr, const T qua_fou,
+	const int* restrict cx, const int* restrict cy )
+{
+	const int r0 = cubic_unsigned_int<T>( 
+		uno_one, uno_two, uno_thr, uno_fou, cx ); 
+	const int r1 = cubic_unsigned_int<T>( 
+		dos_one, dos_two, dos_thr, dos_fou, cx ); 
+	const int r2 = cubic_unsigned_int<T>( 
+		tre_one, tre_two, tre_thr, tre_fou, cx ); 
+	const int r3 = cubic_unsigned_int<T>( 
+		qua_one, qua_two, qua_thr, qua_fou, cx ); 
+
+	return( cubic_unsigned_int<int>( r0, r1, r2, r3, cy ) ); 
+}
+
 static int inline
 signed_fixed_round( int v )
 {
@@ -216,6 +197,18 @@ signed_fixed_round( int v )
 	return( (v + round_by) >> VIPS_INTERPOLATE_SHIFT );
 }
 
+template <typename T> static int inline
+cubic_signed_int(
+	const T one, const T two, const T thr, const T fou,
+	const int* restrict cx )
+{
+	return( signed_fixed_round( 
+		cx[0] * one +
+		cx[1] * two +
+		cx[2] * thr +
+		cx[3] * fou ) ); 
+}
+
 /* Fixed-point integer bicubic, used for 8 and 16-bit types.
  */
 template <typename T> static int inline
@@ -226,47 +219,27 @@ bicubic_signed_int(
 	const T qua_one, const T qua_two, const T qua_thr, const T qua_fou,
 	const int* restrict cx, const int* restrict cy )
 {
-	const int r0 = signed_fixed_round( 
-		cx[0] * uno_one +
-		cx[1] * uno_two +
-		cx[2] * uno_thr +
-		cx[3] * uno_fou ); 
+	const int r0 = cubic_signed_int<T>( 
+		uno_one, uno_two, uno_thr, uno_fou, cx ); 
+	const int r1 = cubic_signed_int<T>( 
+		dos_one, dos_two, dos_thr, dos_fou, cx ); 
+	const int r2 = cubic_signed_int<T>( 
+		tre_one, tre_two, tre_thr, tre_fou, cx ); 
+	const int r3 = cubic_signed_int<T>( 
+		qua_one, qua_two, qua_thr, qua_fou, cx ); 
 
-	const int r1 = signed_fixed_round( 
-		cx[0] * dos_one +
-		cx[1] * dos_two +
-		cx[2] * dos_thr +
-		cx[3] * dos_fou );
-
-	const int r2 = signed_fixed_round( 
-		cx[0] * tre_one +
-		cx[1] * tre_two +
-		cx[2] * tre_thr +
-		cx[3] * tre_fou );
-
-	const int r3 = signed_fixed_round( 
-		cx[0] * qua_one +
-		cx[1] * qua_two +
-		cx[2] * qua_thr +
-		cx[3] * qua_fou );
-
-	return( signed_fixed_round( 
-		cy[0] * r0 +
-		cy[1] * r1 +
-		cy[2] * r2 +
-		cy[3] * r3 ) );
+	return( cubic_signed_int<int>( r0, r1, r2, r3, cy ) ); 
 }
 
-template <typename T> static int inline
-bicubic1d_signed_int(
+template <typename T> static T inline
+cubic_float(
 	const T one, const T two, const T thr, const T fou,
-	const int* restrict cx )
+	const double* restrict cx )
 {
-	return( signed_fixed_round( 
-		cx[0] * one +
-		cx[1] * two +
-		cx[2] * thr +
-		cx[3] * fou ) ); 
+	return( cx[0] * one +
+		 cx[1] * two +
+		 cx[2] * thr +
+		 cx[3] * fou );
 }
 
 /* Floating-point bicubic, used for int/float/double types.
@@ -279,37 +252,16 @@ bicubic_float(
 	const T qua_one, const T qua_two, const T qua_thr, const T qua_fou,
 	const double* restrict cx, const double* restrict cy )
 {
-	return(
-		cy[0] * (cx[0] * uno_one +
-			 cx[1] * uno_two +
-			 cx[2] * uno_thr +
-			 cx[3] * uno_fou)
-                +
-		cy[1] * (cx[0] * dos_one +
-			 cx[1] * dos_two +
-			 cx[2] * dos_thr +
-			 cx[3] * dos_fou)
-                +
-		cy[2] * (cx[0] * tre_one +
-			 cx[1] * tre_two +
-			 cx[2] * tre_thr +
-			 cx[3] * tre_fou)
-                +
-		cy[3] * (cx[0] * qua_one +
-			 cx[1] * qua_two +
-			 cx[2] * qua_thr +
-			 cx[3] * qua_fou) );
-}
+	const double r0 = cubic_float<double>( 
+		uno_one, uno_two, uno_thr, uno_fou, cx ); 
+	const double r1 = cubic_float<double>( 
+		dos_one, dos_two, dos_thr, dos_fou, cx ); 
+	const double r2 = cubic_float<double>( 
+		tre_one, tre_two, tre_thr, tre_fou, cx ); 
+	const double r3 = cubic_float<double>( 
+		qua_one, qua_two, qua_thr, qua_fou, cx ); 
 
-template <typename T> static T inline
-bicubic1d_float(
-	const T one, const T two, const T thr, const T fou,
-	const double* restrict cx )
-{
-	return( cx[0] * one +
-		 cx[1] * two +
-		 cx[2] * thr +
-		 cx[3] * fou );
+	return( cubic_float<double>( r0, r1, r2, r3, cy ) ); 
 }
 
 /* Given an offset in [0,1] (we can have x == 1 when building tables),