diff --git a/.travis.yml b/.travis.yml
index f105dad2..27aab72d 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,7 +1,7 @@
 language: cpp
 
 before_script:
-  - ./bootstrap.sh
+  - ./autogen.sh
   - ./configure
     --disable-dependency-tracking
     --with-jpeg-includes=$JPEG/include
diff --git a/ChangeLog b/ChangeLog
index dc416a0c..697dbd6a 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -29,9 +29,14 @@
 - better upsizing with vips_resize()
 - add imagemagick v7 support, thanks sachinwalia2k8 
 - added vips_worley(), vips_perlin() noise generators
+- added vips_convf(), vips_convi(), vips_convasep(), vips_conva() ... 
+  im_conv*() functions rewritten as classes
+- vips_convsep() calls vips_convasep() for the approximate case
+- new fixed-point vector path for convi is up to about 2x faster
 - gif loader can write 1, 2, 3, or 4 bands depending on file contents
 - support --strip for pngsave
 - add svgz support [Felix Bünemann]
+- rename boostrap.sh -> autogen.sh to help snapcraft
 
 30/7/16 started 8.3.3
 - fix performance regression in 8.3.2, thanks Lovell
diff --git a/README.md b/README.md
index bb8c6e4f..bbac874b 100644
--- a/README.md
+++ b/README.md
@@ -80,7 +80,7 @@ and `gobject-introspection`, see the dependencies section below. For example:
 
 Then build the build system with:
 
-	$ ./bootstrap.sh
+	$ ./autogen.sh
 
 Debug build:
 
diff --git a/TODO b/TODO
index 4102778e..107925ec 100644
--- a/TODO
+++ b/TODO
@@ -16,18 +16,7 @@
 	all with file and buffer source
 
 
-- try:
-
-	$ vips avg broken.jpg[fail]
-
-  about 50% of the time it'll trigger a range of out-of-order reads and lock
-  for 10s or so while seq times out
-
-- add more webp tests to py suite
-
-- try moving some more of the CLI tests to py
-
-
+- add APPROX convsep test? 
 
 - add more webp tests to py suite
 
diff --git a/bootstrap.sh b/autogen.sh
similarity index 100%
rename from bootstrap.sh
rename to autogen.sh
diff --git a/libvips/conversion/sequential.c b/libvips/conversion/sequential.c
index c6a6bbfc..62b19c35 100644
--- a/libvips/conversion/sequential.c
+++ b/libvips/conversion/sequential.c
@@ -184,7 +184,7 @@ vips_sequential_generate( VipsRegion *or,
 		/* Exit the loop on timeout or condition passes. We have to
 		 * be wary of spurious wakeups. 
 		 */
-		while( r->top > sequential->y_pos )
+		while( r->top > sequential->y_pos ) {
 #ifdef HAVE_COND_INIT
 			if( !g_cond_wait_until( sequential->ready, 
 				sequential->lock, time ) )
@@ -195,6 +195,14 @@ vips_sequential_generate( VipsRegion *or,
 				break;
 #endif
 
+			/* We may have woken up because of an eval error.
+			 */
+			if( sequential->error ) {
+				g_mutex_unlock( sequential->lock );
+				return( -1 );
+			}
+		}
+
 		VIPS_GATE_STOP( "vips_sequential_generate: wait" );
 
 		VIPS_DEBUG_MSG_GREEN( "thread %p awake again ...\n", 
@@ -220,7 +228,7 @@ vips_sequential_generate( VipsRegion *or,
 		area.width = 1;
 		area.height = r->top - sequential->y_pos;
 		if( vips_region_prepare( ir, &area ) ) {
-			VIPS_DEBUG_MSG( "thread %p error, unlocking ...\n", 
+			VIPS_DEBUG_MSG( "thread %p error, unlocking #1 ...\n", 
 				g_thread_self() ); 
 			sequential->error = -1;
 			g_cond_broadcast( sequential->ready );
@@ -237,7 +245,7 @@ vips_sequential_generate( VipsRegion *or,
 	VIPS_DEBUG_MSG_GREEN( "thread %p reading ...\n", g_thread_self() ); 
 	if( vips_region_prepare( ir, r ) ||
 		vips_region_region( or, ir, r, r->left, r->top ) ) {
-		VIPS_DEBUG_MSG( "thread %p error, unlocking ...\n", 
+		VIPS_DEBUG_MSG( "thread %p error, unlocking #2 ...\n", 
 			g_thread_self() ); 
 		sequential->error = -1;
 		g_cond_broadcast( sequential->ready );
diff --git a/libvips/convolution/Makefile.am b/libvips/convolution/Makefile.am
index 32f6eec1..d6346aa5 100644
--- a/libvips/convolution/Makefile.am
+++ b/libvips/convolution/Makefile.am
@@ -6,15 +6,15 @@ libconvolution_la_SOURCES = \
 	correlation.c \
 	correlation.h \
 	conv.c \
+	conva.c \
+	convf.c \
+	convi.c \
+	convasep.c \
 	convsep.c \
 	compass.c \
 	fastcor.c \
 	spcor.c \
 	sharpen.c \
-	gaussblur.c \
-	im_aconv.c \
-	im_aconvsep.c \
-	im_conv.c \
-	im_conv_f.c 
+	gaussblur.c 
 
 AM_CPPFLAGS = -I${top_srcdir}/libvips/include @VIPS_CFLAGS@ @VIPS_INCLUDES@ 
diff --git a/libvips/convolution/conv.c b/libvips/convolution/conv.c
index ff7dfd51..1f247b6a 100644
--- a/libvips/convolution/conv.c
+++ b/libvips/convolution/conv.c
@@ -66,18 +66,15 @@ vips_conv_build( VipsObject *object )
 	VipsObjectClass *class = VIPS_OBJECT_GET_CLASS( object );
 	VipsConvolution *convolution = (VipsConvolution *) object;
 	VipsConv *conv = (VipsConv *) object;
-	VipsImage **t = (VipsImage **) 
-		vips_object_local_array( object, 4 );
+	VipsImage **t = (VipsImage **) vips_object_local_array( object, 4 );
 
 	VipsImage *in;
-	INTMASK *imsk;
-	DOUBLEMASK *dmsk;
-
-	g_object_set( conv, "out", vips_image_new(), NULL ); 
 
 	if( VIPS_OBJECT_CLASS( vips_conv_parent_class )->build( object ) )
 		return( -1 );
 
+	g_object_set( conv, "out", vips_image_new(), NULL ); 
+
 	in = convolution->in;
 
 	/*
@@ -85,13 +82,6 @@ vips_conv_build( VipsObject *object )
 	vips_matrixprint( convolution->M, NULL ); 
  	 */
 
-	if( !(imsk = im_vips2imask( convolution->M, class->nickname )) || 
-		!im_local_imask( convolution->out, imsk ) )
-		return( -1 ); 
-	if( !(dmsk = im_vips2mask( convolution->M, class->nickname )) || 
-		!im_local_dmask( convolution->out, dmsk ) )
-		return( -1 ); 
-
 	/* Unpack for processing.
 	 */
 	if( vips_image_decode( in, &t[0] ) )
@@ -99,20 +89,30 @@ vips_conv_build( VipsObject *object )
 	in = t[0];
 
 	switch( conv->precision ) { 
-	case VIPS_PRECISION_INTEGER:
-		if( im_conv( in, convolution->out, imsk ) )
+	case VIPS_PRECISION_FLOAT:
+		if( vips_convf( in, &t[1], convolution->M, NULL ) ||
+			vips_image_write( t[1], convolution->out ) )
 			return( -1 ); 
 		break;
 
-	case VIPS_PRECISION_FLOAT:
-		if( im_conv_f( in, convolution->out, dmsk ) )
+	case VIPS_PRECISION_INTEGER:
+		if( vips_convi( in, &t[1], convolution->M, NULL ) ||
+			vips_image_write( t[1], convolution->out ) )
 			return( -1 ); 
 		break;
 
 	case VIPS_PRECISION_APPROXIMATE:
+{
+		DOUBLEMASK *dmsk;
+
+		if( !(dmsk = im_vips2mask( convolution->M, class->nickname )) || 
+			!im_local_dmask( convolution->out, dmsk ) )
+			return( -1 ); 
+
 		if( im_aconv( in, convolution->out, dmsk, 
 			conv->layers, conv->cluster ) )
 			return( -1 ); 
+}
 		break;
 
 	default:
@@ -175,31 +175,40 @@ vips_conv_init( VipsConv *conv )
  *
  * Optional arguments:
  *
- * * @precision: calculation accuracy
- * * @layers: number of layers for approximation
- * * @cluster: cluster lines closer than this distance
+ * * @precision: #VipsPrecision, calculation accuracy
+ * * @layers: %gint, number of layers for approximation
+ * * @cluster: %gint, cluster lines closer than this distance
  *
  * Convolution. 
  *
  * Perform a convolution of @in with @mask.
- * Each output pixel is
- * calculated as sigma[i]{pixel[i] * mask[i]} / scale + offset, where scale
- * and offset are part of @mask. 
+ * Each output pixel is calculated as:
  *
- * If @precision is #VIPS_PRECISION_INTEGER then the convolution is performed
- * with integer arithmetic and the output image 
+ * |[
+ * sigma[i]{pixel[i] * mask[i]} / scale + offset
+ * ]|
+ *
+ * where scale and offset are part of @mask. 
+ *
+ * If @precision is #VIPS_PRECISION_INTEGER, then 
+ * elements of @mask are converted to
+ * integers before convolution, using rint(),
+ * and the output image 
  * always has the same #VipsBandFormat as the input image. 
  *
- * Convolutions on unsigned 8-bit images are calculated with the 
- * processor's vector unit, if possible. Disable this with --vips-novector or 
- * IM_NOVECTOR.
+ * For #VIPS_FORMAT_UCHAR images, vips_conv() uses a fast vector path based on
+ * fixed-point arithmetic. This can produce slightly different results. 
+ * Disable the vector path with `--vips-novector` or `VIPS_NOVECTOR` or
+ * vips_vector_set_enabled().
  *
  * If @precision is #VIPS_PRECISION_FLOAT then the convolution is performed
  * with floating-point arithmetic. The output image 
- * is always %VIPS_FORMAT_FLOAT unless @in is %VIPS_FORMAT_DOUBLE, in which case
- * @out is also %VIPS_FORMAT_DOUBLE. 
+ * is always #VIPS_FORMAT_FLOAT unless @in is #VIPS_FORMAT_DOUBLE, in which case
+ * @out is also #VIPS_FORMAT_DOUBLE. 
  *
- * If @precision is #VIPS_PRECISION_APPROXIMATE then the output image 
+ * If @precision is #VIPS_PRECISION_APPROXIMATE then, like
+ * #VIPS_PRECISION_INTEGER, @mask is converted to int before convolution, and 
+ * the output image 
  * always has the same #VipsBandFormat as the input image. 
  *
  * Larger values for @layers give more accurate
@@ -211,6 +220,8 @@ vips_conv_init( VipsConv *conv )
  * Smaller values of @cluster will give more accurate results, but be slower
  * and use more memory. 10% of the mask radius is a good rule of thumb.
  *
+ * See also: vips_convsep().
+ *
  * Returns: 0 on success, -1 on error
  */
 int 
diff --git a/libvips/convolution/conva.c b/libvips/convolution/conva.c
new file mode 100644
index 00000000..a98b7552
--- /dev/null
+++ b/libvips/convolution/conva.c
@@ -0,0 +1,1360 @@
+/* conva ... approximate convolution
+ *
+ * This operation does an approximate convolution. 
+ *
+ * Author: John Cupitt & Nicolas Robidoux
+ * Written on: 31/5/11
+ * Modified on: 
+ * 31/5/11
+ *      - from im_aconvsep()
+ * 10/7/16
+ * 	- redone as a class
+ */
+
+/*
+
+    This file is part of VIPS.
+    
+    VIPS is free software; you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+    02110-1301  USA
+
+ */
+
+/*
+
+    These files are distributed with VIPS - http://www.vips.ecs.soton.ac.uk
+
+ */
+
+/*
+
+  See:
+
+	http://incubator.quasimondo.com/processing/stackblur.pde
+
+  This thing is a little like stackblur, but generalised to any 2D 
+  convolution.
+
+ */
+
+/*
+
+  TODO
+
+timing:
+
+$ time vips im_conv_f img_0075.jpg x2.v g2d201.con
+real	5m3.359s
+user	9m34.700s
+sys	0m1.500s
+
+$ time vips im_aconv img_0075.jpg x.v g2d201.con 10 10
+real	0m3.151s
+user	0m5.640s
+sys	0m0.100s
+
+$ vips im_subtract x.v x2.v diff.v
+$ vips im_abs diff.v abs.v
+$ vips im_max abs.v
+2.70833
+
+  	- are we handling mask offset correctly?
+
+	- could we do better with an h and a v cumulativization image? we might 
+	not need the huge intermediate we have now, since any line sum an be 
+	found with simple indexing
+
+ */
+
+/*
+ */
+#define DEBUG
+#define VIPS_DEBUG
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif /*HAVE_CONFIG_H*/
+#include <vips/intl.h>
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <limits.h>
+#include <math.h>
+
+#include <vips/vips.h>
+#include <vips/vector.h>
+#include <vips/debug.h>
+#include <vips/internal.h>
+
+#include "pconvolution.h"
+
+/* Maximum number of boxes we can break the mask into. Don't have this too
+ * high, it'll make the instance huge, and gobject has a 64kb limit. 
+ */
+#define MAX_LINES (1000)
+
+/* The number of edges we consider at once in clustering. Higher values are
+ * faster, but risk pushing up average error in the result.
+ */
+#define MAX_EDGES (1000)
+
+/* A horizontal line in the mask.
+ */
+typedef struct _HLine {
+	/* Start is the left-most pixel in the line, end is one beyond the
+	 * right-most pixel. 
+	 */
+	int start;
+	int end;
+
+	/* The hlines have weights. weight 0 means this line is unused.
+	 */
+	int weight;
+} HLine;
+
+/* For clustering. A pair of hlines and their distance. An edge in a graph.
+ */
+typedef struct _Edge {
+	/* The index into boxes->hline[].
+	 */
+	int a;
+	int b;
+
+	/* The distance between them, see boxes_distance().
+	 */
+	int d;
+} Edge;
+
+/* An element of a vline.
+ */
+typedef struct _VElement {
+	/* band is the index into hline[] we add, row is the row we take 
+	 * it from.
+	 */
+	int band;
+	int row;
+
+	/* Negative lobes are made with factor -1, we also common-up repeated
+	 * additions of the same line.
+	 */
+	int factor;
+} VElement;
+
+/* A vline.
+ */
+typedef struct _VLine {
+	int band;
+	int factor;
+	int start;
+	int end;
+} VLine;
+
+/* A set of boxes. 
+ */
+typedef struct {
+	VipsConvolution parent_instance;
+
+	VipsImage *iM;
+
+	int layers;
+	int cluster;
+
+	int divisor;
+	int rounding;
+	int offset;
+
+	/* The horizontal lines we gather. hline[3] writes to band 3 in the
+	 * intermediate image. max_line is the length of the longest hline:
+	 * over 256 and we need to use an int intermediate for 8-bit images.
+	 */
+	int n_hline;
+	HLine hline[MAX_LINES];
+	int max_line;
+
+	/* During clustering, the top few edges we are considering.
+	 */
+	Edge edge[MAX_EDGES];
+
+	/* Scale and sum a set of hlines to make the final value. 
+	 */
+	int n_velement;
+	VElement velement[MAX_LINES];
+
+	/* And group those velements as vlines.
+	 */
+	int n_vline;
+	VLine vline[MAX_LINES];
+
+} VipsConva;
+
+typedef VipsConvolutionClass VipsConvaClass;
+
+G_DEFINE_TYPE( VipsConva, vips_conva, VIPS_TYPE_CONVOLUTION );
+
+/* Euclid's algorithm. Use this to common up mults.
+ */
+static int
+gcd( int a, int b )
+{
+	if( b == 0 )
+		return( abs( a ) );
+	else
+		return( gcd( b, a % b ) );
+}
+
+static void
+vips_conva_hline_start( VipsConva *conva, int x )
+{
+	conva->hline[conva->n_hline].start = x;
+	conva->hline[conva->n_hline].weight = 1;
+}
+
+static int
+vips_conva_hline_end( VipsConva *conva, int x, int y, int factor )
+{
+	VipsObjectClass *class = VIPS_OBJECT_GET_CLASS( conva );
+
+	conva->hline[conva->n_hline].end = x;
+
+	conva->velement[conva->n_velement].row = y;
+	conva->velement[conva->n_velement].band = conva->n_hline;
+	conva->velement[conva->n_velement].factor = factor;
+
+	if( conva->n_hline >= MAX_LINES - 1 ) {
+		vips_error( class->nickname, "%s", _( "mask too complex" ) );
+		return( -1 );
+	}
+	conva->n_hline += 1;
+
+	if( conva->n_velement >= MAX_LINES - 1 ) {
+		vips_error( class->nickname, "%s", _( "mask too complex" ) );
+		return( -1 );
+	}
+	conva->n_velement += 1;
+
+	return( 0 );
+}
+
+#ifdef DEBUG
+static void
+vips_conva_hprint( VipsConva *conva )
+{
+	int x, y;
+
+	printf( "hlines:\n" );
+	printf( "   n   b   r  f   w\n" );
+	for( y = 0; y < conva->n_velement; y++ ) {
+		int b = conva->velement[y].band;
+
+		printf( "%4d %3d %3d %2d %3d ", 
+			y, b, 
+			conva->velement[y].row, 
+			conva->velement[y].factor,
+			conva->hline[b].weight );
+		for( x = 0; x < 45; x++ ) {
+			int rx = x * (conva->iM->Xsize + 1) / 45;
+
+			if( rx >= conva->hline[b].start && 
+				rx < conva->hline[b].end )
+				printf( "#" );
+			else
+				printf( " " );
+		}
+		printf( " %3d .. %3d\n", 
+			conva->hline[b].start, conva->hline[b].end );
+	}
+}
+
+static void
+vips_conva_vprint( VipsConva *conva )
+{
+	int y;
+
+	printf( "%d vlines:\n", conva->n_vline );
+	printf( "   n  b  f      s      e\n" );
+	for( y = 0; y < conva->n_vline; y++ ) 
+		printf( "%4d %2d %2d == %3d .. %3d\n", y,
+			conva->vline[y].band, 
+			conva->vline[y].factor, 
+			conva->vline[y].start, 
+			conva->vline[y].end );
+
+	printf( "divisor = %d\n", conva->divisor );
+	printf( "rounding = %d\n", conva->rounding );
+	printf( "offset = %d\n", conva->offset );
+	printf( "max_line = %d\n", conva->max_line );
+}
+#endif /*DEBUG*/
+
+/* Break the mask into a set of hlines.
+ */
+static int
+vips_conva_decompose_hlines( VipsConva *conva )
+{
+	VipsImage *iM = conva->iM;
+	const int size = iM->Xsize * iM->Ysize;
+	double *coeff = VIPS_MATRIX( iM, 0, 0 ); 
+
+	double max;
+	double min;
+	double depth;
+	int layers_above;
+	int layers_below;
+	int z, n, x, y;
+
+	/* Find mask range. We must always include the zero axis in the mask.
+	 */
+	max = 0;
+	min = 0;
+	for( n = 0; n < size; n++ ) {
+		max = VIPS_MAX( max, coeff[n] );
+		min = VIPS_MIN( min, coeff[n] );
+	}
+
+	VIPS_DEBUG_MSG( "vips_conva_decompose_hlines: min = %g, max = %g\n", 
+		min, max );
+
+	/* The zero axis must fall on a layer boundary. Estimate the
+	 * depth, find n-lines-above-zero, get exact depth, then calculate a
+	 * fixed n-lines which includes any negative parts.
+	 */
+	depth = (max - min) / conva->layers;
+	layers_above = VIPS_CEIL( max / depth );
+	depth = max / layers_above;
+	layers_below = VIPS_FLOOR( min / depth );
+	conva->layers = layers_above - layers_below;
+
+	VIPS_DEBUG_MSG( "vips_conva_decompose_hlines: depth = %g, layers = %d\n",
+		depth, conva->layers );
+
+	/* For each layer, generate a set of lines which are inside the
+	 * perimeter. Work down from the top.
+	 */
+	for( z = 0; z < conva->layers; z++ ) {
+		/* How deep we are into the mask, as a double we can test
+		 * against. Add half the layer depth so we can easily find >50%
+		 * mask elements.
+		 */
+		double z_ph = max - (1 + z) * depth + depth / 2;
+
+		/* Odd, but we must avoid rounding errors that make us miss 0
+		 * in the line above.
+		 */
+		int z_positive = z < layers_above;
+
+		for( y = 0; y < iM->Ysize; y++ ) {
+			int inside;
+
+			/* Start outside the perimeter.
+			 */
+			inside = 0;
+
+			for( x = 0; x < iM->Xsize; x++ ) {
+				double c = coeff[x + y * iM->Xsize];
+
+				/* The vertical line from mask[x, y] to 0 is 
+				 * inside. Is our current square (x, y) part 
+				 * of that line?
+				 */
+				if( (z_positive && c >= z_ph) ||
+					(!z_positive && c <= z_ph) ) {
+					if( !inside ) {
+						vips_conva_hline_start( conva,
+							x );
+						inside = 1;
+					}
+				}
+				else {
+					if( inside ) {
+						if( vips_conva_hline_end( conva,
+							x, y, 
+							z_positive ? 1 : -1 ) )
+							return( -1 );
+						inside = 0;
+					}
+				}
+			}
+
+			if( inside && 
+				vips_conva_hline_end( conva, 
+					iM->Xsize, y, z_positive ? 1 : -1 ) )
+				return( -1 );
+		}
+	}
+
+#ifdef DEBUG
+	VIPS_DEBUG_MSG( "vips_conva_decompose_hlines: generated %d hlines\n", 
+		conva->n_hline );
+	vips_conva_hprint( conva );
+#endif /*DEBUG*/
+
+	return( 0 );
+}
+
+/* The 'distance' between a pair of hlines.
+ */
+static int
+vips_conva_distance( VipsConva *conva, int a, int b )
+{
+	g_assert( conva->hline[a].weight > 0 && conva->hline[b].weight > 0 );
+
+	return( abs( conva->hline[a].start - conva->hline[b].start ) + 
+		abs( conva->hline[a].end - conva->hline[b].end ) ); 
+}
+
+/* Merge two hlines. Line b is deleted, and any refs to b in vlines updated to
+ * point at a.
+ */
+static void
+vips_conva_merge( VipsConva *conva, int a, int b )
+{
+	int i;
+
+	/* Scale weights. 
+	 */
+	int fa = conva->hline[a].weight;
+	int fb = conva->hline[b].weight;
+	double w = (double) fb / (fa + fb);
+
+	/* New endpoints.
+	 */
+	conva->hline[a].start += w * 
+		(conva->hline[b].start - conva->hline[a].start);
+	conva->hline[a].end += w * 
+		(conva->hline[b].end - conva->hline[a].end);
+	conva->hline[a].weight += conva->hline[b].weight;
+
+	/* Update velement refs to b to refer to a instead.
+	 */
+	for( i = 0; i < conva->n_velement; i++ )
+		if( conva->velement[i].band == b )
+			conva->velement[i].band = a;
+
+	/* Mark b to be deleted.
+	 */
+	conva->hline[b].weight = 0;
+}
+
+static int
+edge_sortfn( const void *p1, const void *p2 )
+{
+	Edge *a = (Edge *) p1;
+	Edge *b = (Edge *) p2;
+
+	return( a->d - b->d );
+}
+
+/* Cluster in batches. Return non-zero if we merged some lines.
+ *
+ * This is not as accurate as rescanning the whole space on every merge, but
+ * it's far faster.
+ */
+static int
+vips_conva_cluster2( VipsConva *conva )
+{
+	int i, j, k;
+	int worst;
+	int worst_i;
+	int merged;
+
+	for( i = 0; i < MAX_EDGES; i++ ) {
+		conva->edge[i].a = -1;
+		conva->edge[i].b = -1;
+		conva->edge[i].d = 99999;
+	}
+	worst_i = 0;
+	worst = conva->edge[worst_i].d;
+
+	for( i = 0; i < conva->n_hline; i++ ) {
+		if( conva->hline[i].weight == 0 )
+			continue;
+
+		for( j = i + 1; j < conva->n_hline; j++ ) {
+			int distance;
+
+			if( conva->hline[j].weight == 0 )
+				continue;
+
+			distance = vips_conva_distance( conva, i, j ); 
+			if( distance < worst ) {
+				conva->edge[worst_i].a = i;
+				conva->edge[worst_i].b = j;
+				conva->edge[worst_i].d = distance;
+
+				worst_i = 0;
+				worst = conva->edge[worst_i].d;
+				for( k = 0; k < MAX_EDGES; k++ )
+					if( conva->edge[k].d > worst ) {
+						worst = conva->edge[k].d;
+						worst_i = k;
+					}
+			}
+		}
+	}
+
+	/* Sort to get closest first.
+	 */
+	qsort( conva->edge, MAX_EDGES, sizeof( Edge ), edge_sortfn );
+
+	/*
+	printf( "edges:\n" );
+	printf( "  n   a   b  d:\n" );
+	for( i = 0; i < MAX_EDGES; i++ )
+		printf( "%2i) %3d %3d %3d\n", i, 
+			conva->edge[i].a, conva->edge[i].b, conva->edge[i].d );
+	 */
+
+	/* Merge from the top down.
+	 */
+	merged = 0;
+	for( k = 0; k < MAX_EDGES; k++ ) {
+		Edge *edge = &conva->edge[k];
+
+		if( edge->d > conva->cluster )
+			break;
+
+		/* Has been removed, see loop below.
+		 */
+		if( edge->a == -1 )
+			continue;
+
+		vips_conva_merge( conva, edge->a, edge->b );
+		merged = 1;
+
+		/* Nodes a and b have vanished or been moved. Remove any edges
+		 * which refer to them from the edge list,
+		 */
+		for( i = k; i < MAX_EDGES; i++ ) {
+			Edge *edgei = &conva->edge[i];
+
+			if( edgei->a == edge->a ||
+				edgei->b == edge->a ||
+				edgei->a == edge->b ||
+				edgei->b == edge->b )
+				edgei->a = -1;
+		}
+	}
+
+	return( merged );
+}
+
+/* Renumber after clustering. We will have removed a lot of hlines ... shuffle
+ * the rest down, adjust all the vline references.
+ */
+static void
+vips_conva_renumber( VipsConva *conva )
+{
+	int i, j;
+
+	VIPS_DEBUG_MSG( "vips_conva_renumber: renumbering ...\n" );
+
+	/* Loop for all zero-weight hlines.
+	 */
+	for( i = 0; i < conva->n_hline; ) {
+		if( conva->hline[i].weight > 0 ) {
+			i++;
+			continue;
+		}
+
+		/* We move hlines i + 1 down, so we need to adjust all
+		 * band[] refs to match.
+		 */
+		for( j = 0; j < conva->n_velement; j++ )
+			if( conva->velement[j].band > i ) 
+				conva->velement[j].band -= 1;
+
+		memmove( conva->hline + i, conva->hline + i + 1, 
+			sizeof( HLine ) * (conva->n_hline - i - 1) );
+		conva->n_hline -= 1;
+	}
+
+
+	VIPS_DEBUG_MSG( "vips_conva_renumber: ... %d hlines remain\n", 
+		conva->n_hline );
+}
+
+/* Sort by band, then factor, then row.
+ */
+static int
+velement_sortfn( const void *p1, const void *p2 )
+{
+	VElement *a = (VElement *) p1;
+	VElement *b = (VElement *) p2;
+
+	if( a->band != b->band )
+		return( a->band - b->band );
+
+	if( a->factor != b->factor )
+		return( a->factor - b->factor );
+
+	return( a->row - b->row );
+}
+
+static void
+vips_conva_vline( VipsConva *conva )
+{
+	int y, z;
+
+	VIPS_DEBUG_MSG( "vips_conva_vline: forming vlines ...\n" );
+
+	/* Sort to get elements which could form a vline together.
+	 */
+	qsort( conva->velement, conva->n_velement, sizeof( VElement ), 
+		velement_sortfn );
+
+#ifdef DEBUG
+	vips_conva_hprint( conva );
+#endif /*DEBUG*/
+
+	/* If two lines have the same row and band, we can join them and knock
+	 * up the factor instead.
+	 */
+	for( y = 0; y < conva->n_velement; y++ ) {
+		for( z = y + 1; z < conva->n_velement; z++ )
+			if( conva->velement[z].band != 
+				conva->velement[y].band ||
+				conva->velement[z].row != 
+					conva->velement[y].row )
+				break;
+
+		/* We need to keep the sign of the old factor.
+		 */
+		if( conva->velement[y].factor > 0 )
+			conva->velement[y].factor = z - y;
+		else
+			conva->velement[y].factor = y - z;
+		memmove( conva->velement + y + 1, conva->velement + z,
+			sizeof( VElement ) * (conva->n_velement - z) );
+		conva->n_velement -= z - y - 1;
+	}
+
+#ifdef DEBUG
+	printf( "after commoning up, %d velement remain\n", conva->n_velement );
+	vips_conva_hprint( conva );
+#endif /*DEBUG*/
+
+	conva->n_vline = 0;
+	for( y = 0; y < conva->n_velement; ) {
+		int n = conva->n_vline;
+
+		/* Start of a line.
+		 */
+		conva->vline[n].band = conva->velement[y].band;
+		conva->vline[n].factor = conva->velement[y].factor;
+		conva->vline[n].start = conva->velement[y].row;
+
+		/* Search for the end of this line.
+		 */
+		for( z = y + 1; z < conva->n_velement; z++ ) 
+			if( conva->velement[z].band != 
+					conva->vline[n].band ||
+				conva->velement[z].factor != 
+					conva->vline[n].factor ||
+				conva->velement[z].row != 
+					conva->vline[n].start + z - y )
+				break;
+
+		/* So the line ends at the previously examined element. We
+		 * want 'end' to be one beyond that (non-inclusive).
+		 */
+		conva->vline[n].end = conva->velement[z - 1].row + 1;
+
+		conva->n_vline += 1;
+		y = z;
+	}
+
+	VIPS_DEBUG_MSG( "vips_conva_vline: found %d vlines\n", conva->n_vline );
+}
+
+/* Break a mask into boxes.
+ */
+static int
+vips_conva_decompose_boxes( VipsConva *conva )
+{
+	VipsObjectClass *class = VIPS_OBJECT_GET_CLASS( conva );
+	VipsImage *iM = conva->iM;
+	double *coeff = VIPS_MATRIX( iM, 0, 0 ); 
+	const int size = iM->Xsize * iM->Ysize;
+	double scale = vips_image_get_scale( iM ); 
+	double offset = vips_image_get_offset( iM ); 
+
+	double sum;
+	double area;
+	int x, y, z;
+
+	if( vips_conva_decompose_hlines( conva ) )
+		return( -1 );
+
+	/* Cluster to find groups of lines.
+	 */
+	VIPS_DEBUG_MSG( "vips_conva_decompose_boxes: "
+		"clustering hlines with thresh %d ...\n", conva->cluster );
+	while( vips_conva_cluster2( conva ) )
+		;
+
+	/* Renumber to remove holes created by clustering.
+	 */
+	vips_conva_renumber( conva );
+
+	/* Find a set of vlines for the remaining hlines.
+	 */
+	vips_conva_vline( conva );
+
+	/* Find the area of the lines and the length of the longest hline. We
+	 * find the absolute area, we don't want -ves to cancel.
+	 */
+	area = 0;
+	conva->max_line = 0;
+	for( y = 0; y < conva->n_velement; y++ ) {
+		x = conva->velement[y].band;
+		z = conva->hline[x].end - conva->hline[x].start;
+
+		area += abs( conva->velement[y].factor * z );
+		if( z > conva->max_line )
+			conva->max_line = z;
+	}
+
+	/* Strength reduction: if all lines are divisible by n, we can move
+	 * that n out into the area factor. The aim is to produce as many
+	 * factor 1 lines as we can and to reduce the chance of overflow.
+	 */
+	x = conva->velement[0].factor;
+	for( y = 1; y < conva->n_velement; y++ ) 
+		x = gcd( x, conva->velement[y].factor );
+	for( y = 0; y < conva->n_velement; y++ ) 
+		conva->velement[y].factor /= x;
+	area *= x;
+
+	/* Find the area of the original mask. Again, don't let -ves cancel.
+	 */
+	sum = 0;
+	for( z = 0; z < size; z++ ) 
+		sum += abs( coeff[z] );
+
+	conva->divisor = VIPS_RINT( area * scale / sum );
+	conva->rounding = (conva->divisor + 1) / 2;
+	conva->offset = offset;
+
+#ifdef DEBUG
+	vips_conva_hprint( conva );
+	vips_conva_vprint( conva );
+#endif /*DEBUG*/
+
+	/* With 512x512 tiles, each hline requires 3mb of intermediate per
+	 * thread ... 300 lines is about a gb per thread, ouch.
+	 */
+	if( conva->n_hline > 150 ) {
+		vips_error( class->nickname, "%s", _( "mask too complex" ) );
+		return( -1 );
+	}
+
+	return( 0 );
+}
+
+/* Our sequence value.
+ */
+typedef struct {
+	VipsConva *conva;
+
+	VipsRegion *ir;		/* Input region */
+
+	/* Offsets for start and stop. 
+	 */
+	int *start;		
+	int *end;
+
+	int last_stride;	/* Avoid recalcing offsets, if we can */
+
+	/* The rolling sums. int for integer types, double for floating point 
+	 * types.
+	 */
+	void *sum;		
+} VipsConvaSeq;
+
+/* Free a sequence value.
+ */
+static int
+vips_conva_stop( void *vseq, void *a, void *b )
+{
+	VipsConvaSeq *seq = (VipsConvaSeq *) vseq;
+
+	VIPS_UNREF( seq->ir );
+
+	return( 0 );
+}
+
+/* Convolution start function.
+ */
+static void *
+vips_conva_start( VipsImage *out, void *a, void *b )
+{
+	VipsImage *in = (VipsImage *) a;
+	VipsConva *conva = (VipsConva *) b;
+
+	VipsConvaSeq *seq;
+
+	seq = VIPS_NEW( out, VipsConvaSeq );
+	seq->conva = conva;
+	seq->ir = vips_region_new( in );
+
+	/* n_velement should be the largest possible dimension.
+	 */
+	g_assert( conva->n_velement >= conva->n_hline );
+	g_assert( conva->n_velement >= conva->n_vline );
+
+	seq->start = VIPS_ARRAY( out, conva->n_velement, int );
+	seq->end = VIPS_ARRAY( out, conva->n_velement, int );
+
+	if( vips_band_format_isint( out->BandFmt ) )
+		seq->sum = VIPS_ARRAY( out, conva->n_velement, int );
+	else
+		seq->sum = VIPS_ARRAY( out, conva->n_velement, double );
+	seq->last_stride = -1;
+
+	return( seq );
+}
+
+/* The h and v loops are very similar, but also annoyingly different. Keep
+ * them separate for easy debugging.
+ */
+
+#define HCONV( IN, OUT ) \
+G_STMT_START { \
+	for( i = 0; i < bands; i++ ) { \
+		OUT *seq_sum = (OUT *) seq->sum; \
+		\
+		IN *p; \
+		OUT *q; \
+		\
+		p = i + (IN *) VIPS_REGION_ADDR( ir, r->left, r->top + y ); \
+		q = i * n_hline + \
+			(OUT *) VIPS_REGION_ADDR( or, r->left, r->top + y ); \
+		\
+		for( z = 0; z < n_hline; z++ ) { \
+			seq_sum[z] = 0; \
+			for( x = conva->hline[z].start; \
+				x < conva->hline[z].end; x++ ) \
+				seq_sum[z] += p[x * istride]; \
+			q[z] = seq_sum[z]; \
+		} \
+		q += ostride; \
+		\
+		for( x = 1; x < r->width; x++ ) {  \
+			for( z = 0; z < n_hline; z++ ) { \
+				seq_sum[z] += p[seq->end[z]]; \
+				seq_sum[z] -= p[seq->start[z]]; \
+				q[z] = seq_sum[z]; \
+			} \
+			p += istride; \
+			q += ostride; \
+		} \
+	} \
+} G_STMT_END
+
+/* Do horizontal masks ... we scan the mask along scanlines.
+ */
+static int
+vips_conva_hgenerate( VipsRegion *or, void *vseq, 
+	void *a, void *b, gboolean *stop )
+{
+	VipsConvaSeq *seq = (VipsConvaSeq *) vseq;
+	VipsImage *in = (VipsImage *) a;
+	VipsConva *conva = (VipsConva *) b;
+
+	VipsRegion *ir = seq->ir;
+	const int n_hline = conva->n_hline;
+	VipsImage *iM = conva->iM;
+	VipsRect *r = &or->valid;
+
+	/* Double the bands (notionally) for complex.
+	 */
+	int bands = vips_band_format_iscomplex( in->BandFmt ) ? 
+		2 * in->Bands : in->Bands;
+
+	VipsRect s;
+	int x, y, z, i;
+	int istride;
+	int ostride;
+
+	/* Prepare the section of the input image we need. A little larger
+	 * than the section of the output image we are producing.
+	 */
+	s = *r;
+	s.width += iM->Xsize - 1;
+	if( vips_region_prepare( ir, &s ) )
+		return( -1 );
+
+	istride = VIPS_IMAGE_SIZEOF_PEL( in ) / 
+		VIPS_IMAGE_SIZEOF_ELEMENT( in );
+	ostride = VIPS_IMAGE_SIZEOF_PEL( or->im ) / 
+		VIPS_IMAGE_SIZEOF_ELEMENT( or->im );
+
+        /* Init offset array. 
+	 */
+	if( seq->last_stride != istride ) {
+		seq->last_stride = istride;
+
+		for( z = 0; z < n_hline; z++ ) {
+			seq->start[z] = conva->hline[z].start * istride;
+			seq->end[z] = conva->hline[z].end * istride;
+		}
+	}
+
+	for( y = 0; y < r->height; y++ ) { 
+		switch( in->BandFmt ) {
+		case VIPS_FORMAT_UCHAR: 	
+			if( conva->max_line < 256 )
+				HCONV( unsigned char, unsigned short );
+			else
+				HCONV( unsigned char, unsigned int );
+			break;
+
+		case VIPS_FORMAT_CHAR: 	
+			if( conva->max_line < 256 )
+				HCONV( signed char, signed short );
+			else
+				HCONV( signed char, signed int );
+			break;
+
+		case VIPS_FORMAT_USHORT: 	
+			HCONV( unsigned short, unsigned int );
+			break;
+
+		case VIPS_FORMAT_SHORT: 	
+			HCONV( signed short, signed int );
+			break;
+
+		case VIPS_FORMAT_UINT: 	
+			HCONV( unsigned int, unsigned int );
+			break;
+
+		case VIPS_FORMAT_INT: 	
+			HCONV( signed int, signed int );
+			break;
+
+		case VIPS_FORMAT_FLOAT: 	
+			HCONV( float, float );
+			break;
+
+		case VIPS_FORMAT_DOUBLE: 	
+			HCONV( double, double );
+			break;
+
+		case VIPS_FORMAT_COMPLEX: 	
+			HCONV( float, float );
+			break;
+
+		case VIPS_FORMAT_DPCOMPLEX: 	
+			HCONV( double, double );
+			break;
+
+		default:
+			g_assert_not_reached();
+		}
+	}
+
+	return( 0 );
+}
+
+static int
+vips_conva_horizontal( VipsConva *conva, VipsImage *in, VipsImage **out )
+{
+	VipsObjectClass *class = VIPS_OBJECT_GET_CLASS( conva );
+
+	/* Prepare output. Consider a 7x7 mask and a 7x7 image --- the output
+	 * would be 1x1.
+	 */
+	*out = vips_image_new(); 
+	if( vips_image_pipelinev( *out, 
+		VIPS_DEMAND_STYLE_SMALLTILE, in, NULL ) )
+		return( -1 );
+
+	(*out)->Xsize -= conva->iM->Xsize - 1;
+	if( (*out)->Xsize <= 0 ) { 
+		vips_error( class->nickname, 
+			"%s", _( "image too small for mask" ) );
+		return( -1 );
+	}
+	(*out)->Bands *= conva->n_hline;
+
+	/* Short u?char lines can use u?short intermediate.
+	 */
+	if( vips_band_format_isuint( in->BandFmt ) )
+		(*out)->BandFmt = conva->max_line < 256 ? 
+			VIPS_FORMAT_USHORT : VIPS_FORMAT_UINT;
+	else if( vips_band_format_isint( in->BandFmt ) )
+		(*out)->BandFmt = conva->max_line < 256 ? 
+			VIPS_FORMAT_SHORT : VIPS_FORMAT_INT;
+
+	if( vips_image_generate( *out, 
+		vips_conva_start, vips_conva_hgenerate, vips_conva_stop, 
+		in, conva ) )
+		return( -1 );
+
+	return( 0 );
+}
+
+#define CLIP_UCHAR( V ) \
+G_STMT_START { \
+	if( (V) < 0 ) \
+		(V) = 0; \
+	else if( (V) > UCHAR_MAX ) \
+		(V) = UCHAR_MAX; \
+} G_STMT_END
+
+#define CLIP_CHAR( V ) \
+G_STMT_START { \
+	if( (V) < SCHAR_MIN ) \
+		(V) = SCHAR_MIN; \
+	else if( (V) > SCHAR_MAX ) \
+		(V) = SCHAR_MAX; \
+} G_STMT_END
+
+#define CLIP_USHORT( V ) \
+G_STMT_START { \
+	if( (V) < 0 ) \
+		(V) = 0; \
+	else if( (V) > USHRT_MAX ) \
+		(V) = USHRT_MAX; \
+} G_STMT_END
+
+#define CLIP_SHORT( V ) \
+G_STMT_START { \
+	if( (V) < SHRT_MIN ) \
+		(V) = SHRT_MIN; \
+	else if( (V) > SHRT_MAX ) \
+		(V) = SHRT_MAX; \
+} G_STMT_END
+
+#define CLIP_NONE( V ) {}
+
+#define VCONV( ACC, IN, OUT, CLIP ) \
+G_STMT_START { \
+	for( x = 0; x < sz; x++ ) { \
+		ACC *seq_sum = (ACC *) seq->sum; \
+		\
+		IN *p; \
+		OUT *q; \
+		ACC sum; \
+		\
+		p = x * conva->n_hline + \
+			(IN *) VIPS_REGION_ADDR( ir, r->left, r->top ); \
+		q = x + (OUT *) VIPS_REGION_ADDR( or, r->left, r->top ); \
+		\
+		sum = 0; \
+		for( z = 0; z < n_vline; z++ ) { \
+			seq_sum[z] = 0; \
+			for( k = conva->vline[z].start; \
+				k < conva->vline[z].end; k++ ) \
+				seq_sum[z] += p[k * istride + \
+					conva->vline[z].band]; \
+			sum += conva->vline[z].factor * seq_sum[z]; \
+		} \
+		sum = (sum + conva->rounding) / conva->divisor + conva->offset; \
+		CLIP( sum ); \
+		*q = sum; \
+		q += ostride; \
+		\
+		for( y = 1; y < r->height; y++ ) { \
+			sum = 0;\
+			for( z = 0; z < n_vline; z++ ) { \
+				seq_sum[z] += p[seq->end[z]]; \
+				seq_sum[z] -= p[seq->start[z]]; \
+				sum += conva->vline[z].factor * seq_sum[z]; \
+			} \
+			p += istride; \
+			sum = (sum + conva->rounding) / conva->divisor + \
+				conva->offset; \
+			CLIP( sum ); \
+			*q = sum; \
+			q += ostride; \
+		} \
+	} \
+} G_STMT_END
+
+/* Do vertical masks ... we scan the mask down columns of pixels. 
+ */
+static int
+vips_conva_vgenerate( VipsRegion *or, void *vseq, 
+	void *a, void *b, gboolean *stop )
+{
+	VipsConvaSeq *seq = (VipsConvaSeq *) vseq;
+	VipsImage *in = (VipsImage *) a;
+	VipsConva *conva = (VipsConva *) b;
+	VipsConvolution *convolution = (VipsConvolution *) conva;
+
+	VipsRegion *ir = seq->ir;
+	const int n_vline = conva->n_vline;
+	VipsImage *iM = conva->iM;
+	VipsRect *r = &or->valid;
+
+	/* Double the width (notionally) for complex.
+	 */
+	int sz = vips_band_format_iscomplex( in->BandFmt ) ? 
+		2 * VIPS_REGION_N_ELEMENTS( or ) : VIPS_REGION_N_ELEMENTS( or );
+
+	VipsRect s;
+	int x, y, z, k;
+	int istride;
+	int ostride;
+
+	/* Prepare the section of the input image we need. A little larger
+	 * than the section of the output image we are producing.
+	 */
+	s = *r;
+	s.height += iM->Ysize - 1;
+	if( vips_region_prepare( ir, &s ) )
+		return( -1 );
+
+	istride = VIPS_REGION_LSKIP( ir ) / 
+		VIPS_IMAGE_SIZEOF_ELEMENT( in );
+	ostride = VIPS_REGION_LSKIP( or ) / 
+		VIPS_IMAGE_SIZEOF_ELEMENT( convolution->out );
+
+        /* Init offset array. 
+	 */
+	if( seq->last_stride != istride ) {
+		seq->last_stride = istride;
+
+		for( z = 0; z < n_vline; z++ ) {
+			seq->start[z] = conva->vline[z].band + 
+				conva->vline[z].start * istride;
+			seq->end[z] = conva->vline[z].band + 
+				conva->vline[z].end * istride;
+		}
+	}
+
+	switch( convolution->in->BandFmt ) {
+	case VIPS_FORMAT_UCHAR: 	
+		if( conva->max_line < 256 )
+			VCONV( unsigned int, \
+				unsigned short, unsigned char, CLIP_UCHAR );
+		else
+			VCONV( unsigned int, \
+				unsigned int, unsigned char, CLIP_UCHAR );
+		break;
+
+	case VIPS_FORMAT_CHAR: 	
+		if( conva->max_line < 256 )
+			VCONV( signed int, \
+				signed short, signed char, CLIP_CHAR );
+		else
+			VCONV( signed int, \
+				signed int, signed char, CLIP_CHAR );
+		break;
+
+	case VIPS_FORMAT_USHORT: 	
+		VCONV( unsigned int, \
+			unsigned int, unsigned short, CLIP_USHORT );
+		break;
+
+	case VIPS_FORMAT_SHORT: 	
+		VCONV( signed int, signed int, signed short, CLIP_SHORT );
+		break;
+
+	case VIPS_FORMAT_UINT: 	
+		VCONV( unsigned int, unsigned int, unsigned int, CLIP_NONE );
+		break;
+
+	case VIPS_FORMAT_INT: 	
+		VCONV( signed int, signed int, signed int, CLIP_NONE );
+		break;
+
+	case VIPS_FORMAT_FLOAT: 	
+		VCONV( float, float, float, CLIP_NONE );
+		break;
+
+	case VIPS_FORMAT_DOUBLE: 	
+		VCONV( double, double, double, CLIP_NONE );
+		break;
+
+	case VIPS_FORMAT_COMPLEX: 	
+		VCONV( float, float, float, CLIP_NONE );
+		break;
+
+	case VIPS_FORMAT_DPCOMPLEX: 	
+		VCONV( double, double, double, CLIP_NONE );
+		break;
+
+	default:
+		g_assert_not_reached();
+	}
+
+	return( 0 );
+}
+
+static int
+vips_conva_vertical( VipsConva *conva, VipsImage *in, VipsImage **out )
+{
+	VipsObjectClass *class = VIPS_OBJECT_GET_CLASS( conva );
+	VipsConvolution *convolution = (VipsConvolution *) conva;
+
+	/* Prepare output. Consider a 7x7 mask and a 7x7 image --- the output
+	 * would be 1x1.
+	 */
+	*out = vips_image_new(); 
+	if( vips_image_pipelinev( *out, 
+		VIPS_DEMAND_STYLE_SMALLTILE, in, NULL ) )
+		return( -1 );
+
+	(*out)->Ysize -= conva->iM->Ysize - 1;
+	if( (*out)->Ysize <= 0 ) { 
+		vips_error( class->nickname, 
+			"%s", _( "image too small for mask" ) );
+		return( -1 );
+	}
+	(*out)->Bands = convolution->in->Bands;
+	(*out)->BandFmt = convolution->in->BandFmt;
+
+	if( vips_image_generate( *out, 
+		vips_conva_start, vips_conva_vgenerate, vips_conva_stop, 
+		in, conva ) )
+		return( -1 );
+
+	return( 0 );
+}
+
+static int
+vips_conva_build( VipsObject *object )
+{
+	VipsConvolution *convolution = (VipsConvolution *) object;
+	VipsConva *conva = (VipsConva *) object;
+	VipsImage **t = (VipsImage **) vips_object_local_array( object, 4 );
+
+	VipsImage *in;
+
+	if( VIPS_OBJECT_CLASS( vips_conva_parent_class )->build( object ) )
+		return( -1 );
+
+	/* An int version of our mask.
+	 */
+	if( vips__image_intize( convolution->M, &t[0] ) )
+		return( -1 );
+	conva->iM = t[0]; 
+
+#ifdef DEBUG
+	printf( "vips_conva_build: iM =\n" );
+	vips_matrixprint( conva->iM, NULL );
+#endif /*DEBUG*/
+
+	in = convolution->in;
+
+	if( vips_conva_decompose_boxes( conva ) )
+	       return( -1 ); 	
+
+	g_object_set( conva, "out", vips_image_new(), NULL ); 
+	if( 
+		vips_embed( in, &t[1], 
+			t[0]->Xsize / 2, 
+			t[0]->Ysize / 2, 
+			in->Xsize + t[0]->Xsize - 1, 
+			in->Ysize + t[0]->Ysize - 1,
+			"extend", VIPS_EXTEND_COPY,
+			NULL ) ||
+		vips_conva_horizontal( conva, t[1], &t[2] ) ||
+		vips_conva_vertical( conva, t[2], &t[3] ) ||
+		vips_image_write( t[3], convolution->out ) )
+		return( -1 );
+
+	convolution->out->Xoffset = 0;
+	convolution->out->Yoffset = 0;
+
+	return( 0 );
+}
+
+static void
+vips_conva_class_init( VipsConvaClass *class )
+{
+	GObjectClass *gobject_class = G_OBJECT_CLASS( class );
+	VipsObjectClass *object_class = (VipsObjectClass *) class;
+
+	gobject_class->set_property = vips_object_set_property;
+	gobject_class->get_property = vips_object_get_property;
+
+	object_class->nickname = "conva";
+	object_class->description = _( "approximate integer convolution" );
+	object_class->build = vips_conva_build;
+
+	VIPS_ARG_INT( class, "layers", 104, 
+		_( "Layers" ), 
+		_( "Use this many layers in approximation" ),
+		VIPS_ARGUMENT_OPTIONAL_INPUT, 
+		G_STRUCT_OFFSET( VipsConva, layers ), 
+		1, 1000, 5 ); 
+
+	VIPS_ARG_INT( class, "cluster", 105, 
+		_( "Cluster" ), 
+		_( "Cluster lines closer than this in approximation" ),
+		VIPS_ARGUMENT_OPTIONAL_INPUT, 
+		G_STRUCT_OFFSET( VipsConva, cluster ), 
+		1, 100, 1 ); 
+
+}
+
+static void
+vips_conva_init( VipsConva *conva )
+{
+        conva->layers = 5;
+        conva->cluster = 1;
+}
+
+/**
+ * vips_conva:
+ * @in: input image
+ * @out: output image
+ * @mask: convolution mask
+ * @...: %NULL-terminated list of optional named arguments
+ *
+ * Optional arguments:
+ *
+ * * @layers: %gint, number of layers for approximation
+ * * @cluster: %gint, cluster lines closer than this distance
+ *
+ * Perform an approximate integer convolution of @in with @mask.
+ * This is a low-level operation, see 
+ * vips_conv() for something more convenient. 
+ *
+ * The output image 
+ * always has the same #VipsBandFormat as the input image. 
+ * Elements of @mask are converted to
+ * integers before convolution.
+ *
+ * Larger values for @layers give more accurate
+ * results, but are slower. As @layers approaches the mask radius, the
+ * accuracy will become close to exact convolution and the speed will drop to 
+ * match. For many large masks, such as Gaussian, @layers need be only 10% of
+ * this value and accuracy will still be good.
+ *
+ * Smaller values of @cluster will give more accurate results, but be slower
+ * and use more memory. 10% of the mask radius is a good rule of thumb.
+ *
+ * See also: vips_conv().
+ *
+ * Returns: 0 on success, -1 on error
+ */
+int 
+vips_conva( VipsImage *in, VipsImage **out, VipsImage *mask, ... )
+{
+	va_list ap;
+	int result;
+
+	va_start( ap, mask );
+	result = vips_call_split( "conva", ap, in, out, mask );
+	va_end( ap );
+
+	return( result );
+}
+
diff --git a/libvips/convolution/convasep.c b/libvips/convolution/convasep.c
new file mode 100644
index 00000000..5f5209a6
--- /dev/null
+++ b/libvips/convolution/convasep.c
@@ -0,0 +1,958 @@
+/* convasep ... separable approximate convolution
+ *
+ * This operation does an approximate, seperable convolution. 
+ *
+ * Author: John Cupitt & Nicolas Robidoux
+ * Written on: 31/5/11
+ * Modified on: 
+ * 31/5/11
+ *      - from im_conv()
+ * 5/7/16
+ * 	- redone as a class
+ */
+
+/*
+
+    This file is part of VIPS.
+    
+    VIPS is free software; you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+    02110-1301  USA
+
+ */
+
+/*
+
+    These files are distributed with VIPS - http://www.vips.ecs.soton.ac.uk
+
+ */
+
+/*
+
+  See:
+
+	http://incubator.quasimondo.com/processing/stackblur.pde
+
+  This thing is a little like stackblur, but generalised to any separable 
+  mask.
+
+ */
+
+/*
+
+  TODO
+
+	- how about making a cumulative image and then subtracting points in 
+	  that, rather than keeping a set of running totals
+
+	  faster?
+
+	  we could then use orc to write a bit of code to implement this set 
+	  of lines
+
+	  stackoverflow has an algorithm for cumulativization using SIMD and 
+	  threads, see that font rasterization with rust piece on medium by 
+	  ralph levien
+
+ */
+
+/*
+#define DEBUG
+#define VIPS_DEBUG
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif /*HAVE_CONFIG_H*/
+#include <vips/intl.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <limits.h>
+#include <math.h>
+
+#include <vips/vips.h>
+#include <vips/vector.h>
+#include <vips/debug.h>
+#include <vips/internal.h>
+
+#include "pconvolution.h"
+
+/* Maximum number of lines we can break the mask into.
+ */
+#define MAX_LINES (1000)
+
+/* Euclid's algorithm. Use this to common up mults.
+ */
+static int
+gcd( int a, int b )
+{
+	if( b == 0 )
+		return( abs( a ) );
+	else
+		return( gcd( b, a % b ) );
+}
+
+typedef struct {
+	VipsConvolution parent_instance;
+
+	int layers;
+
+	int divisor;
+	int rounding;
+	int offset;
+
+	/* The "width" of the mask, ie. n for our 1xn or nx1 argument, plus 
+	 * an int version of our mask. 
+	 */
+	int width;
+	VipsImage *iM;
+
+	/* The mask broken into a set of lines.
+	 *
+	 * Start is the left-most pixel in the line, end is one beyond the
+	 * right-most pixel.
+	 */
+	int n_lines;
+	int start[MAX_LINES];
+	int end[MAX_LINES];
+	int factor[MAX_LINES];
+} VipsConvasep;
+
+typedef VipsConvolutionClass VipsConvasepClass;
+
+G_DEFINE_TYPE( VipsConvasep, vips_convasep, VIPS_TYPE_CONVOLUTION );
+
+static void
+vips_convasep_line_start( VipsConvasep *convasep, int x, int factor )
+{
+	convasep->start[convasep->n_lines] = x;
+	convasep->factor[convasep->n_lines] = factor;
+}
+
+static int
+vips_convasep_line_end( VipsConvasep *convasep, int x )
+{
+	VipsObjectClass *class = VIPS_OBJECT_GET_CLASS( convasep );
+
+	convasep->end[convasep->n_lines] = x;
+
+	if( convasep->n_lines >= MAX_LINES - 1 ) {
+		vips_error( class->nickname, "%s", _( "mask too complex" ) );
+		return( -1 );
+	}
+	convasep->n_lines += 1;
+
+	return( 0 );
+}
+
+/* Break a mask into lines.
+ */
+static int
+vips_convasep_decompose( VipsConvasep *convasep )
+{
+	VipsImage *iM = convasep->iM;
+	double *coeff = (double *) VIPS_IMAGE_ADDR( iM, 0, 0 );
+	double scale = vips_image_get_scale( iM ); 
+	double offset = vips_image_get_offset( iM ); 
+
+	double max;
+	double min;
+	double depth;
+	double sum;
+	double area;
+	int layers;
+	int layers_above;
+	int layers_below;
+	int z, n, x;
+
+	VIPS_DEBUG_MSG( "vips_convasep_decompose: "
+		"breaking into %d layers ...\n", convasep->layers );
+
+	/* Find mask range. We must always include the zero axis in the mask.
+	 */
+	max = 0;
+	min = 0;
+	for( x = 0; x < convasep->width; x++ ) {
+		if( coeff[x] > max )
+			max = coeff[x];
+		if( coeff[x] < min )
+			min = coeff[x];
+	}
+
+	/* The zero axis must fall on a layer boundary. Estimate the
+	 * depth, find n-lines-above-zero, get exact depth, then calculate a
+	 * fixed n-lines which includes any negative parts.
+	 */
+	depth = (max - min) / convasep->layers;
+	layers_above = ceil( max / depth );
+	depth = max / layers_above;
+	layers_below = floor( min / depth );
+	layers = layers_above - layers_below;
+
+	VIPS_DEBUG_MSG( "depth = %g, layers = %d\n", depth, layers );
+
+	/* For each layer, generate a set of lines which are inside the
+	 * perimeter. Work down from the top.
+	 */
+	for( z = 0; z < layers; z++ ) {
+		double y = max - (1 + z) * depth;
+
+		/* y plus half depth ... ie. the layer midpoint.
+		 */
+		double y_ph = y + depth / 2;
+
+		/* Odd, but we must avoid rounding errors that make us miss 0
+		 * in the line above.
+		 */
+		int y_positive = z < layers_above;
+
+		int inside;
+
+		/* Start outside the perimeter.
+		 */
+		inside = 0;
+
+		for( x = 0; x < convasep->width; x++ ) {
+			/* The vertical line from mask[z] to 0 is inside. Is
+			 * our current square (x, y) part of that line?
+			 */
+			if( (y_positive && coeff[x] >= y_ph) ||
+				(!y_positive && coeff[x] <= y_ph) ) {
+				if( !inside ) {
+					vips_convasep_line_start( convasep, x, 
+						y_positive ? 1 : -1 );
+					inside = 1;
+				}
+			}
+			else if( inside ) {
+				if( vips_convasep_line_end( convasep, x ) )
+					return( -1 );
+				inside = 0;
+			}
+		}
+
+		if( inside && 
+			vips_convasep_line_end( convasep, convasep->width ) )
+			return( -1 );
+	}
+
+	/* Can we common up any lines? Search for lines with identical
+	 * start/end.
+	 */
+	for( z = 0; z < convasep->n_lines; z++ ) {
+		for( n = z + 1; n < convasep->n_lines; n++ ) {
+			if( convasep->start[z] == convasep->start[n] &&
+				convasep->end[z] == convasep->end[n] ) {
+				convasep->factor[z] += convasep->factor[n];
+
+				/* n can be deleted. Do this in a separate
+				 * pass below.
+				 */
+				convasep->factor[n] = 0;
+			}
+		}
+	}
+
+	/* Now we can remove all factor 0 lines.
+	 */
+	for( z = 0; z < convasep->n_lines; z++ ) {
+		if( convasep->factor[z] == 0 ) {
+			for( x = z; x < convasep->n_lines; x++ ) {
+				convasep->start[x] = convasep->start[x + 1];
+				convasep->end[x] = convasep->end[x + 1];
+				convasep->factor[x] = convasep->factor[x + 1];
+			}
+			convasep->n_lines -= 1;
+		}
+	}
+
+	/* Find the area of the lines.
+	 */
+	area = 0;
+	for( z = 0; z < convasep->n_lines; z++ ) 
+		area += convasep->factor[z] * 
+			(convasep->end[z] - convasep->start[z]);
+
+	/* Strength reduction: if all lines are divisible by n, we can move
+	 * that n out into the ->area factor. The aim is to produce as many
+	 * factor 1 lines as we can and to reduce the chance of overflow.
+	 */
+	x = convasep->factor[0];
+	for( z = 1; z < convasep->n_lines; z++ ) 
+		x = gcd( x, convasep->factor[z] );
+	for( z = 0; z < convasep->n_lines; z++ ) 
+		convasep->factor[z] /= x;
+	area *= x;
+
+	/* Find the area of the original mask.
+	 */
+	sum = 0;
+	for( z = 0; z < convasep->width; z++ ) 
+		sum += coeff[z];
+
+	convasep->divisor = VIPS_RINT( sum * area / scale );
+	if( convasep->divisor == 0 )
+		convasep->divisor = 1;
+	convasep->rounding = (convasep->divisor + 1) / 2;
+	convasep->offset = offset;
+
+#ifdef DEBUG
+	/* ASCII-art layer drawing.
+	 */
+	printf( "lines:\n" );
+	for( z = 0; z < convasep->n_lines; z++ ) {
+		printf( "%3d - %2d x ", z, convasep->factor[z] );
+		for( x = 0; x < 55; x++ ) {
+			int rx = x * (convasep->width + 1) / 55;
+
+			if( rx >= convasep->start[z] && rx < convasep->end[z] )
+				printf( "#" );
+			else
+				printf( " " );
+		}
+		printf( " %3d .. %3d\n", convasep->start[z], convasep->end[z] );
+	}
+	printf( "divisor = %d\n", convasep->divisor );
+	printf( "rounding = %d\n", convasep->rounding );
+	printf( "offset = %d\n", convasep->offset );
+#endif /*DEBUG*/
+
+	return( 0 );
+}
+
+/* Our sequence value.
+ */
+typedef struct {
+	VipsConvasep *convasep;
+
+	VipsRegion *ir;		/* Input region */
+
+	int *start;		/* Offsets for start and stop */
+	int *end;
+
+	/* The sums for each line. int for integer types, double for floating
+	 * point types.
+	 */
+	int *isum;		
+	double *dsum;		
+
+	int last_stride;	/* Avoid recalcing offsets, if we can */
+} VipsConvasepSeq;
+
+/* Free a sequence value.
+ */
+static int
+vips_convasep_stop( void *vseq, void *a, void *b )
+{
+	VipsConvasepSeq *seq = (VipsConvasepSeq *) vseq;
+
+	VIPS_UNREF( seq->ir );
+	VIPS_FREE( seq->start );
+	VIPS_FREE( seq->end );
+	VIPS_FREE( seq->isum );
+	VIPS_FREE( seq->dsum );
+
+	return( 0 );
+}
+
+/* Convolution start function.
+ */
+static void *
+vips_convasep_start( VipsImage *out, void *a, void *b )
+{
+	VipsImage *in = (IMAGE *) a;
+	VipsConvasep *convasep = (VipsConvasep *) b;
+
+	VipsConvasepSeq *seq;
+
+	if( !(seq = VIPS_NEW( out, VipsConvasepSeq )) )
+		return( NULL );
+
+	/* Init!
+	 */
+	seq->convasep = convasep;
+	seq->ir = vips_region_new( in );
+	seq->start = VIPS_ARRAY( NULL, convasep->n_lines, int );
+	seq->end = VIPS_ARRAY( NULL, convasep->n_lines, int );
+	seq->isum = NULL;
+	seq->dsum = NULL;
+	if( vips_band_format_isint( out->BandFmt ) )
+		seq->isum = VIPS_ARRAY( NULL, convasep->n_lines, int );
+	else
+		seq->dsum = VIPS_ARRAY( NULL, convasep->n_lines, double );
+	seq->last_stride = -1;
+
+	if( !seq->ir || 
+		!seq->start || 
+		!seq->end || 
+		(!seq->isum && !seq->dsum) ) {
+		vips_convasep_stop( seq, in, convasep );
+		return( NULL );
+	}
+
+	return( seq );
+}
+
+#define CLIP_UCHAR( V ) \
+G_STMT_START { \
+	if( (V) < 0 ) \
+		(V) = 0; \
+	else if( (V) > UCHAR_MAX ) \
+		(V) = UCHAR_MAX; \
+} G_STMT_END
+
+#define CLIP_CHAR( V ) \
+G_STMT_START { \
+	if( (V) < SCHAR_MIN ) \
+		(V) = SCHAR_MIN; \
+	else if( (V) > SCHAR_MAX ) \
+		(V) = SCHAR_MAX; \
+} G_STMT_END
+
+#define CLIP_USHORT( V ) \
+G_STMT_START { \
+	if( (V) < 0 ) \
+		(V) = 0; \
+	else if( (V) > USHRT_MAX ) \
+		(V) = USHRT_MAX; \
+} G_STMT_END
+
+#define CLIP_SHORT( V ) \
+G_STMT_START { \
+	if( (V) < SHRT_MIN ) \
+		(V) = SHRT_MIN; \
+	else if( (V) > SHRT_MAX ) \
+		(V) = SHRT_MAX; \
+} G_STMT_END
+
+#define CLIP_NONE( V ) {}
+
+/* The h and v loops are very similar, but also annoyingly different. Keep
+ * them separate for easy debugging.
+ */
+
+#define HCONV_INT( TYPE, CLIP ) { \
+	for( i = 0; i < bands; i++ ) { \
+		int *isum = seq->isum; \
+		\
+		TYPE *q; \
+		TYPE *p; \
+		int sum; \
+		\
+		p = i + (TYPE *) VIPS_REGION_ADDR( ir, r->left, r->top + y ); \
+		q = i + (TYPE *) VIPS_REGION_ADDR( or, r->left, r->top + y ); \
+		\
+		sum = 0; \
+		for( z = 0; z < n_lines; z++ ) { \
+			isum[z] = 0; \
+			for( x = seq->start[z]; x < seq->end[z]; x += istride ) \
+				isum[z] += p[x]; \
+			sum += convasep->factor[z] * isum[z]; \
+		} \
+		\
+		/* Don't add offset ... we only want to do that once, do it on \
+		 * the vertical pass. \
+		 */ \
+		sum = (sum + convasep->rounding) / convasep->divisor; \
+		CLIP( sum ); \
+		*q = sum; \
+		q += ostride; \
+		\
+		for( x = 1; x < r->width; x++ ) {  \
+			sum = 0; \
+			for( z = 0; z < n_lines; z++ ) { \
+				isum[z] += p[seq->end[z]]; \
+				isum[z] -= p[seq->start[z]]; \
+				sum += convasep->factor[z] * isum[z]; \
+			} \
+			p += istride; \
+			sum = (sum + convasep->rounding) / convasep->divisor; \
+			CLIP( sum ); \
+			*q = sum; \
+			q += ostride; \
+		} \
+	} \
+}
+
+#define HCONV_FLOAT( TYPE ) { \
+	for( i = 0; i < bands; i++ ) { \
+		double *dsum = seq->dsum; \
+		\
+		TYPE *q; \
+		TYPE *p; \
+		double sum; \
+		\
+		p = i + (TYPE *) VIPS_REGION_ADDR( ir, r->left, r->top + y ); \
+		q = i + (TYPE *) VIPS_REGION_ADDR( or, r->left, r->top + y ); \
+		\
+		sum = 0; \
+		for( z = 0; z < n_lines; z++ ) { \
+			dsum[z] = 0; \
+			for( x = seq->start[z]; x < seq->end[z]; x += istride ) \
+				dsum[z] += p[x]; \
+			sum += convasep->factor[z] * dsum[z]; \
+		} \
+		\
+		/* Don't add offset ... we only want to do that once, do it on \
+		 * the vertical pass. \
+		 */ \
+		sum = sum / convasep->divisor; \
+		*q = sum; \
+		q += ostride; \
+		\
+		for( x = 1; x < r->width; x++ ) {  \
+			sum = 0; \
+			for( z = 0; z < n_lines; z++ ) { \
+				dsum[z] += p[seq->end[z]]; \
+				dsum[z] -= p[seq->start[z]]; \
+				sum += convasep->factor[z] * dsum[z]; \
+			} \
+			p += istride; \
+			sum = sum / convasep->divisor; \
+			*q = sum; \
+			q += ostride; \
+		} \
+	} \
+}
+
+/* Do horizontal masks ... we scan the mask along scanlines.
+ */
+static int
+vips_convasep_generate_horizontal( VipsRegion *or, 
+	void *vseq, void *a, void *b, gboolean *stop )
+{
+	VipsConvasepSeq *seq = (VipsConvasepSeq *) vseq;
+	VipsImage *in = (VipsImage *) a;
+	VipsConvasep *convasep = (VipsConvasep *) b;
+	VipsConvolution *convolution = (VipsConvolution *) convasep;
+
+	VipsRegion *ir = seq->ir;
+	const int n_lines = convasep->n_lines;
+	VipsRect *r = &or->valid;
+
+	/* Double the bands (notionally) for complex.
+	 */
+	int bands = vips_band_format_iscomplex( in->BandFmt ) ? 
+		2 * in->Bands : in->Bands;
+
+	VipsRect s;
+	int x, y, z, i;
+	int istride;
+	int ostride;
+
+	/* Prepare the section of the input image we need. A little larger
+	 * than the section of the output image we are producing.
+	 */
+	s = *r;
+	s.width += convasep->width - 1;
+	if( vips_region_prepare( ir, &s ) )
+		return( -1 );
+
+	/* Stride can be different for the vertical case, keep this here for
+	 * ease of direction change.
+	 */
+	istride = VIPS_IMAGE_SIZEOF_PEL( in ) / 
+		VIPS_IMAGE_SIZEOF_ELEMENT( in );
+	ostride = VIPS_IMAGE_SIZEOF_PEL( convolution->out ) / 
+		VIPS_IMAGE_SIZEOF_ELEMENT( convolution->out );
+
+        /* Init offset array. 
+	 */
+	if( seq->last_stride != istride ) {
+		seq->last_stride = istride;
+
+		for( z = 0; z < n_lines; z++ ) {
+			seq->start[z] = convasep->start[z] * istride;
+			seq->end[z] = convasep->end[z] * istride;
+		}
+	}
+
+	for( y = 0; y < r->height; y++ ) { 
+		switch( in->BandFmt ) {
+		case VIPS_FORMAT_UCHAR: 	
+			HCONV_INT( unsigned char, CLIP_UCHAR );
+			break;
+
+		case VIPS_FORMAT_CHAR: 	
+			HCONV_INT( signed char, CLIP_CHAR );
+			break;
+
+		case VIPS_FORMAT_USHORT: 	
+			HCONV_INT( unsigned short, CLIP_USHORT );
+			break;
+
+		case VIPS_FORMAT_SHORT: 	
+			HCONV_INT( signed short, CLIP_SHORT );
+			break;
+
+		case VIPS_FORMAT_UINT: 	
+			HCONV_INT( unsigned int, CLIP_NONE );
+			break;
+
+		case VIPS_FORMAT_INT: 	
+			HCONV_INT( signed int, CLIP_NONE );
+			break;
+
+		case VIPS_FORMAT_FLOAT: 	
+		case VIPS_FORMAT_COMPLEX: 	
+			HCONV_FLOAT( float );
+			break;
+
+		case VIPS_FORMAT_DOUBLE: 	
+		case VIPS_FORMAT_DPCOMPLEX: 	
+			HCONV_FLOAT( double );
+			break;
+
+		default:
+			g_assert_not_reached();
+		}
+	}
+
+	return( 0 );
+}
+
+#define VCONV_INT( TYPE, CLIP ) { \
+	for( x = 0; x < sz; x++ ) { \
+		int *isum = seq->isum; \
+		\
+		TYPE *q; \
+		TYPE *p; \
+		int sum; \
+		\
+		p = x + (TYPE *) VIPS_REGION_ADDR( ir, r->left, r->top ); \
+		q = x + (TYPE *) VIPS_REGION_ADDR( or, r->left, r->top ); \
+		\
+		sum = 0; \
+		for( z = 0; z < n_lines; z++ ) { \
+			isum[z] = 0; \
+			for( y = seq->start[z]; y < seq->end[z]; y += istride ) \
+				isum[z] += p[y]; \
+			sum += convasep->factor[z] * isum[z]; \
+		} \
+		sum = (sum + convasep->rounding) / convasep->divisor + \
+			convasep->offset; \
+		CLIP( sum ); \
+		*q = sum; \
+		q += ostride; \
+		\
+		for( y = 1; y < r->height; y++ ) { \
+			sum = 0; \
+			for( z = 0; z < n_lines; z++ ) { \
+				isum[z] += p[seq->end[z]]; \
+				isum[z] -= p[seq->start[z]]; \
+				sum += convasep->factor[z] * isum[z]; \
+			} \
+			p += istride; \
+			sum = (sum + convasep->rounding) / convasep->divisor + \
+				convasep->offset; \
+			CLIP( sum ); \
+			*q = sum; \
+			q += ostride; \
+		} \
+	} \
+}
+
+#define VCONV_FLOAT( TYPE ) { \
+	for( x = 0; x < sz; x++ ) { \
+		double *dsum = seq->dsum; \
+		\
+		TYPE *q; \
+		TYPE *p; \
+		double sum; \
+		\
+		p = x + (TYPE *) VIPS_REGION_ADDR( ir, r->left, r->top ); \
+		q = x + (TYPE *) VIPS_REGION_ADDR( or, r->left, r->top ); \
+		\
+		sum = 0; \
+		for( z = 0; z < n_lines; z++ ) { \
+			dsum[z] = 0; \
+			for( y = seq->start[z]; y < seq->end[z]; y += istride ) \
+				dsum[z] += p[y]; \
+			sum += convasep->factor[z] * dsum[z]; \
+		} \
+		sum = sum / convasep->divisor + convasep->offset; \
+		*q = sum; \
+		q += ostride; \
+		\
+		for( y = 1; y < r->height; y++ ) { \
+			sum = 0; \
+			for( z = 0; z < n_lines; z++ ) { \
+				dsum[z] += p[seq->end[z]]; \
+				dsum[z] -= p[seq->start[z]]; \
+				sum += convasep->factor[z] * dsum[z]; \
+			} \
+			p += istride; \
+			sum = sum / convasep->divisor + convasep->offset; \
+			*q = sum; \
+			q += ostride; \
+		} \
+	} \
+}
+
+/* Do vertical masks ... we scan the mask down columns of pixels. Copy-paste
+ * from above with small changes.
+ */
+static int
+vips_convasep_generate_vertical( VipsRegion *or, 
+	void *vseq, void *a, void *b, gboolean *stop )
+{
+	VipsConvasepSeq *seq = (VipsConvasepSeq *) vseq;
+	VipsImage *in = (VipsImage *) a;
+	VipsConvasep *convasep = (VipsConvasep *) b;
+	VipsConvolution *convolution = (VipsConvolution *) convasep;
+
+	VipsRegion *ir = seq->ir;
+	const int n_lines = convasep->n_lines;
+	VipsRect *r = &or->valid;
+
+	/* Double the width (notionally) for complex.
+	 */
+	int sz = vips_band_format_iscomplex( in->BandFmt ) ? 
+		2 * VIPS_REGION_N_ELEMENTS( or ) : VIPS_REGION_N_ELEMENTS( or );
+
+	VipsRect s;
+	int x, y, z;
+	int istride;
+	int ostride;
+
+	/* Prepare the section of the input image we need. A little larger
+	 * than the section of the output image we are producing.
+	 */
+	s = *r;
+	s.height += convasep->width - 1;
+	if( vips_region_prepare( ir, &s ) )
+		return( -1 );
+
+	/* Stride can be different for the vertical case, keep this here for
+	 * ease of direction change.
+	 */
+	istride = VIPS_REGION_LSKIP( ir ) / VIPS_IMAGE_SIZEOF_ELEMENT( in );
+	ostride = VIPS_REGION_LSKIP( or ) / 
+		VIPS_IMAGE_SIZEOF_ELEMENT( convolution->out );
+
+        /* Init offset array. 
+	 */
+	if( seq->last_stride != istride ) {
+		seq->last_stride = istride;
+
+		for( z = 0; z < n_lines; z++ ) {
+			seq->start[z] = convasep->start[z] * istride;
+			seq->end[z] = convasep->end[z] * istride;
+		}
+	}
+
+	switch( in->BandFmt ) {
+	case VIPS_FORMAT_UCHAR: 	
+		VCONV_INT( unsigned char, CLIP_UCHAR );
+		break;
+
+	case VIPS_FORMAT_CHAR: 	
+		VCONV_INT( signed char, CLIP_CHAR );
+		break;
+
+	case VIPS_FORMAT_USHORT: 	
+		VCONV_INT( unsigned short, CLIP_USHORT );
+		break;
+
+	case VIPS_FORMAT_SHORT: 	
+		VCONV_INT( signed short, CLIP_SHORT );
+		break;
+
+	case VIPS_FORMAT_UINT: 	
+		VCONV_INT( unsigned int, CLIP_NONE );
+		break;
+
+	case VIPS_FORMAT_INT: 	
+		VCONV_INT( signed int, CLIP_NONE );
+		break;
+
+	case VIPS_FORMAT_FLOAT: 	
+	case VIPS_FORMAT_COMPLEX: 	
+		VCONV_FLOAT( float );
+		break;
+
+	case VIPS_FORMAT_DOUBLE: 	
+	case VIPS_FORMAT_DPCOMPLEX: 	
+		VCONV_FLOAT( double );
+		break;
+
+	default:
+		g_assert_not_reached();
+	}
+
+	return( 0 );
+}
+
+static int
+vips_convasep_pass( VipsConvasep *convasep, 
+	VipsImage *in, VipsImage **out, VipsDirection direction )
+{
+	VipsObjectClass *class = VIPS_OBJECT_GET_CLASS( convasep );
+
+	VipsGenerateFn gen;
+
+	*out = vips_image_new(); 
+	if( vips_image_pipelinev( *out, 
+		VIPS_DEMAND_STYLE_SMALLTILE, in, NULL ) )
+		return( -1 );
+
+	if( direction == VIPS_DIRECTION_HORIZONTAL ) { 
+		(*out)->Xsize -= convasep->width - 1;
+		gen = vips_convasep_generate_horizontal;
+	}
+	else {
+		(*out)->Ysize -= convasep->width - 1;
+		gen = vips_convasep_generate_vertical;
+	}
+
+	if( (*out)->Xsize <= 0 || 
+		(*out)->Ysize <= 0 ) {
+		vips_error( class->nickname, 
+			"%s", _( "image too small for mask" ) );
+		return( -1 );
+	}
+
+	if( vips_image_generate( *out, 
+		vips_convasep_start, gen, vips_convasep_stop, in, convasep ) )
+		return( -1 );
+
+	return( 0 );
+}
+
+static int 
+vips_convasep_build( VipsObject *object )
+{
+	VipsObjectClass *class = VIPS_OBJECT_GET_CLASS( object );
+	VipsConvolution *convolution = (VipsConvolution *) object;
+	VipsConvasep *convasep = (VipsConvasep *) object;
+	VipsImage **t = (VipsImage **) vips_object_local_array( object, 4 );
+
+	VipsImage *in;
+
+	if( VIPS_OBJECT_CLASS( vips_convasep_parent_class )->build( object ) )
+		return( -1 );
+
+	if( vips_check_separable( class->nickname, convolution->M ) ) 
+                return( -1 );
+
+	/* An int version of our mask.
+	 */
+	if( vips__image_intize( convolution->M, &t[3] ) )
+		return( -1 );
+	convasep->iM = t[3]; 
+	convasep->width = convasep->iM->Xsize * convasep->iM->Ysize;
+	in = convolution->in;
+
+	if( vips_convasep_decompose( convasep ) )
+		return( -1 ); 
+
+	g_object_set( convasep, "out", vips_image_new(), NULL ); 
+	if( 
+		vips_embed( in, &t[0], 
+			convasep->width / 2, 
+			convasep->width / 2, 
+			in->Xsize + convasep->width - 1, 
+			in->Ysize + convasep->width - 1,
+			"extend", VIPS_EXTEND_COPY,
+			NULL ) ||
+		vips_convasep_pass( convasep, 
+			t[0], &t[1], VIPS_DIRECTION_HORIZONTAL ) ||
+		vips_convasep_pass( convasep, 
+			t[1], &t[2], VIPS_DIRECTION_VERTICAL ) ||
+		vips_image_write( t[2], convolution->out ) )
+		return( -1 );
+
+	convolution->out->Xoffset = 0;
+	convolution->out->Yoffset = 0;
+
+	return( 0 );
+}
+
+static void
+vips_convasep_class_init( VipsConvasepClass *class )
+{
+	GObjectClass *gobject_class = G_OBJECT_CLASS( class );
+	VipsObjectClass *object_class = (VipsObjectClass *) class;
+
+	gobject_class->set_property = vips_object_set_property;
+	gobject_class->get_property = vips_object_get_property;
+
+	object_class->nickname = "convasep";
+	object_class->description = 
+		_( "approximate separable integer convolution" );
+	object_class->build = vips_convasep_build;
+
+	VIPS_ARG_INT( class, "layers", 104, 
+		_( "Layers" ), 
+		_( "Use this many layers in approximation" ),
+		VIPS_ARGUMENT_OPTIONAL_INPUT, 
+		G_STRUCT_OFFSET( VipsConvasep, layers ), 
+		1, 1000, 5 ); 
+
+}
+
+static void
+vips_convasep_init( VipsConvasep *convasep )
+{
+        convasep->layers = 5;
+	convasep->n_lines = 0;
+}
+
+/**
+ * vips_convasep:
+ * @in: input image
+ * @out: output image
+ * @mask: convolve with this mask
+ * @...: %NULL-terminated list of optional named arguments
+ *
+ * Optional arguments:
+ *
+ * * @layers: %gint, number of layers for approximation
+ *
+ * Approximate separable integer convolution. This is a low-level operation, see 
+ * vips_convsep() for something more convenient. 
+ *
+ * The image is convolved twice: once with @mask and then again with @mask 
+ * rotated by 90 degrees. 
+ * @mask must be 1xn or nx1 elements. 
+ * Elements of @mask are converted to
+ * integers before convolution.
+ *
+ * Larger values for @layers give more accurate
+ * results, but are slower. As @layers approaches the mask radius, the
+ * accuracy will become close to exact convolution and the speed will drop to 
+ * match. For many large masks, such as Gaussian, @layers need be only 10% of
+ * this value and accuracy will still be good.
+ *
+ * The output image 
+ * always has the same #VipsBandFormat as the input image. 
+ *
+ * See also: vips_convsep().
+ *
+ * Returns: 0 on success, -1 on error
+ */
+int 
+vips_convasep( VipsImage *in, VipsImage **out, VipsImage *mask, ... )
+{
+	va_list ap;
+	int result;
+
+	va_start( ap, mask );
+	result = vips_call_split( "convasep", ap, in, out, mask );
+	va_end( ap );
+
+	return( result );
+}
+
diff --git a/libvips/convolution/convf.c b/libvips/convolution/convf.c
new file mode 100644
index 00000000..be7b59b2
--- /dev/null
+++ b/libvips/convolution/convf.c
@@ -0,0 +1,406 @@
+/* convf
+ *
+ * Copyright: 1990, N. Dessipris.
+ *
+ * Author: Nicos Dessipris & Kirk Martinez
+ * Written on: 29/04/1991
+ * Modified on: 19/05/1991
+ * 8/7/93 JC
+ *      - adapted for partial v2
+ *      - memory leaks fixed
+ *      - ANSIfied
+ * 12/7/93 JC
+ *	- adapted im_convbi() to im_convbf()
+ * 7/10/94 JC
+ *	- new IM_ARRAY() macro
+ *	- evalend callbacks
+ *	- more typedef
+ * 9/3/01 JC
+ *	- redone from im_conv() 
+ * 27/7/01 JC
+ *	- rejects masks with scale == 0
+ * 7/4/04 
+ *	- now uses im_embed() with edge stretching on the input, not
+ *	  the output
+ *	- sets Xoffset / Yoffset
+ * 11/11/05
+ * 	- simpler inner loop avoids gcc4 bug 
+ * 12/11/09
+ * 	- only rebuild the buffer offsets if bpl changes
+ * 	- tiny speedups and cleanups
+ * 	- add restrict, though it doesn't seem to help gcc
+ * 	- add mask-all-zero check
+ * 13/11/09
+ * 	- rename as im_conv_f() to make it easier for vips.c to make the
+ * 	  overloaded version
+ * 3/2/10
+ * 	- gtkdoc
+ * 	- more cleanups
+ * 1/10/10
+ * 	- support complex (just double the bands)
+ * 29/10/10
+ * 	- get rid of im_convsep_f(), just call this twice, no longer worth
+ * 	  keeping two versions
+ * 15/10/11 Nicolas
+ * 	- handle offset correctly in seperable convolutions
+ * 26/1/16 Lovell Fuller
+ * 	- remove Duff for a 25% speedup
+ * 23/6/16
+ * 	- redone as a class
+ */
+
+/*
+
+    This file is part of VIPS.
+    
+    VIPS is free software; you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+    02110-1301  USA
+
+ */
+
+/*
+
+    These files are distributed with VIPS - http://www.vips.ecs.soton.ac.uk
+
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif /*HAVE_CONFIG_H*/
+#include <vips/intl.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <limits.h>
+
+#include <vips/vips.h>
+
+#include "pconvolution.h"
+
+typedef struct {
+	VipsConvolution parent_instance;
+
+	/* We make a smaller version of the mask with the zeros squeezed out.
+	 */
+	int nnz;		/* Number of non-zero mask elements */
+	double *coeff;		/* Array of non-zero mask coefficients */
+	int *coeff_pos;		/* Index of each nnz element in mask->coeff */
+} VipsConvf;
+
+typedef VipsConvolutionClass VipsConvfClass;
+
+G_DEFINE_TYPE( VipsConvf, vips_convf, VIPS_TYPE_CONVOLUTION );
+
+/* Our sequence value.
+ */
+typedef struct {
+	VipsConvf *convf;
+	VipsRegion *ir;		/* Input region */
+
+	int *offsets;		/* Offsets for each non-zero matrix element */
+	VipsPel **pts;		/* Per-non-zero mask element image pointers */
+
+	int last_bpl;		/* Avoid recalcing offsets, if we can */
+} VipsConvfSequence;
+
+/* Free a sequence value.
+ */
+static int
+vips_convf_stop( void *vseq, void *a, void *b )
+{
+	VipsConvfSequence *seq = (VipsConvfSequence *) vseq;
+
+	VIPS_UNREF( seq->ir );
+
+	return( 0 );
+}
+
+/* Convolution start function.
+ */
+static void *
+vips_convf_start( VipsImage *out, void *a, void *b )
+{
+	VipsImage *in = (VipsImage *) a;
+	VipsConvf *convf = (VipsConvf *) b;
+	VipsConvfSequence *seq;
+
+	if( !(seq = VIPS_NEW( out, VipsConvfSequence )) )
+		return( NULL );
+
+	seq->convf = convf;
+	seq->ir = NULL;
+	seq->pts = NULL;
+	seq->last_bpl = -1;
+
+	seq->ir = vips_region_new( in );
+	if( !(seq->offsets = VIPS_ARRAY( out, convf->nnz, int )) ||
+		!(seq->pts = VIPS_ARRAY( out, convf->nnz, VipsPel * )) ) {
+		vips_convf_stop( seq, in, convf );
+		return( NULL );
+	}
+
+	return( (void *) seq );
+}
+
+#define CONV_FLOAT( ITYPE, OTYPE ) { \
+	ITYPE ** restrict p = (ITYPE **) seq->pts; \
+	OTYPE * restrict q = (OTYPE *) VIPS_REGION_ADDR( or, le, y ); \
+	\
+	for( x = 0; x < sz; x++ ) {  \
+		double sum; \
+		int i; \
+		\
+		sum = 0; \
+		for ( i = 0; i < nnz; i++ ) \
+			sum += t[i] * p[i][x]; \
+ 		\
+		sum = (sum / scale) + offset; \
+		\
+		q[x] = sum;  \
+	}  \
+} 
+
+/* Convolve!
+ */
+static int
+vips_convf_gen( REGION *or, void *vseq, void *a, void *b, gboolean *stop )
+{
+	VipsConvfSequence *seq = (VipsConvfSequence *) vseq;
+	VipsConvf *convf = (VipsConvf *) b;
+	VipsConvolution *convolution = (VipsConvolution *) convf;
+	VipsImage *M = convolution->M;
+	double scale = vips_image_get_scale( M ); 
+	double offset = vips_image_get_offset( M ); 
+	VipsImage *in = (VipsImage *) a;
+	VipsRegion *ir = seq->ir;
+	double * restrict t = convf->coeff; 
+	const int nnz = convf->nnz;
+	VipsRect *r = &or->valid;
+	int le = r->left;
+	int to = r->top;
+	int bo = VIPS_RECT_BOTTOM( r );
+	int sz = VIPS_REGION_N_ELEMENTS( or ) * 
+		(vips_band_format_iscomplex( in->BandFmt ) ? 2 : 1);
+
+	VipsRect s;
+	int x, y, z, i;
+
+	/* Prepare the section of the input image we need. A little larger
+	 * than the section of the output image we are producing.
+	 */
+	s = *r;
+	s.width += M->Xsize - 1;
+	s.height += M->Ysize - 1;
+	if( vips_region_prepare( ir, &s ) )
+		return( -1 );
+
+        /* Fill offset array. Only do this if the bpl has changed since the 
+	 * previous vips_region_prepare().
+	 */
+	if( seq->last_bpl != VIPS_REGION_LSKIP( ir ) ) {
+		seq->last_bpl = VIPS_REGION_LSKIP( ir );
+
+		for( i = 0; i < nnz; i++ ) {
+			z = convf->coeff_pos[i];
+			x = z % M->Xsize;
+			y = z / M->Xsize;
+
+			seq->offsets[i] = 
+				VIPS_REGION_ADDR( ir, x + le, y + to ) -
+				VIPS_REGION_ADDR( ir, le, to );
+		}
+	}
+
+	for( y = to; y < bo; y++ ) { 
+		/* Init pts for this line of PELs.
+		 */
+		for( z = 0; z < nnz; z++ )
+			seq->pts[z] = seq->offsets[z] +
+				VIPS_REGION_ADDR( ir, le, y ); 
+
+		switch( in->BandFmt ) {
+		case VIPS_FORMAT_UCHAR: 	
+			CONV_FLOAT( unsigned char, float ); 
+			break;
+
+		case VIPS_FORMAT_CHAR:   
+			CONV_FLOAT( signed char, float ); 
+			break;
+
+		case VIPS_FORMAT_USHORT: 
+			CONV_FLOAT( unsigned short, float ); 
+			break;
+
+		case VIPS_FORMAT_SHORT:  
+			CONV_FLOAT( signed short, float ); 
+			break;
+
+		case VIPS_FORMAT_UINT:   
+			CONV_FLOAT( unsigned int, float ); 
+			break;
+
+		case VIPS_FORMAT_INT:    
+			CONV_FLOAT( signed int, float ); 
+			break;
+
+		case VIPS_FORMAT_FLOAT:  
+		case VIPS_FORMAT_COMPLEX:  
+			CONV_FLOAT( float, float ); 
+			break;
+
+		case VIPS_FORMAT_DOUBLE: 
+		case VIPS_FORMAT_DPCOMPLEX:  
+			CONV_FLOAT( double, double ); 
+			break;
+
+		default:
+			g_assert_not_reached();
+		}
+	}
+
+	return( 0 );
+}
+
+static int
+vips_convf_build( VipsObject *object )
+{
+	VipsConvolution *convolution = (VipsConvolution *) object;
+	VipsConvf *convf = (VipsConvf *) object;
+	VipsImage **t = (VipsImage **) vips_object_local_array( object, 4 );
+
+	VipsImage *in;
+	VipsImage *M;
+	double *coeff;
+	int ne;
+        int i;
+
+	if( VIPS_OBJECT_CLASS( vips_convf_parent_class )->build( object ) )
+		return( -1 );
+
+	M = convolution->M;
+	coeff = (double *) VIPS_IMAGE_ADDR( M, 0, 0 );
+	ne = M->Xsize * M->Ysize;
+        if( !(convf->coeff = VIPS_ARRAY( object, ne, double )) ||
+        	!(convf->coeff_pos = VIPS_ARRAY( object, ne, int )) )
+                return( -1 );
+
+        /* Find non-zero mask elements.
+         */
+        for( i = 0; i < ne; i++ )
+                if( coeff[i] ) {
+			convf->coeff[convf->nnz] = coeff[i];
+			convf->coeff_pos[convf->nnz] = i;
+			convf->nnz += 1;
+		}
+
+	/* Was the whole mask zero? We must have at least 1 element in there:
+	 * set it to zero.
+	 */
+	if( convf->nnz == 0 ) {
+		convf->coeff[0] = 0;
+		convf->coeff_pos[0] = 0;
+		convf->nnz = 1;
+	}
+
+	in = convolution->in;
+
+	if( vips_embed( in, &t[0], 
+		M->Xsize / 2, M->Ysize / 2, 
+		in->Xsize + M->Xsize - 1, in->Ysize + M->Ysize - 1,
+		"extend", VIPS_EXTEND_COPY,
+		NULL ) )
+		return( -1 );
+	in = t[0]; 
+
+	g_object_set( convf, "out", vips_image_new(), NULL ); 
+	if( vips_image_pipelinev( convolution->out, 
+		VIPS_DEMAND_STYLE_SMALLTILE, in, NULL ) )
+		return( -1 );
+
+	convolution->out->Xoffset = 0;
+	convolution->out->Yoffset = 0;
+
+	/* Prepare output. Consider a 7x7 mask and a 7x7 image --- the output
+	 * would be 1x1.
+	 */
+	if( vips_bandfmt_isint( in->BandFmt ) ) 
+		convolution->out->BandFmt = IM_BANDFMT_FLOAT;
+	convolution->out->Xsize -= M->Xsize - 1;
+	convolution->out->Ysize -= M->Ysize - 1;
+
+	if( vips_image_generate( convolution->out, 
+		vips_convf_start, vips_convf_gen, vips_convf_stop, in, convf ) )
+		return( -1 );
+
+	convolution->out->Xoffset = -M->Xsize / 2;
+	convolution->out->Yoffset = -M->Ysize / 2;
+
+	return( 0 );
+}
+
+static void
+vips_convf_class_init( VipsConvfClass *class )
+{
+	VipsObjectClass *object_class = (VipsObjectClass *) class;
+
+	object_class->nickname = "convf";
+	object_class->description = _( "float convolution operation" );
+	object_class->build = vips_convf_build;
+}
+
+static void
+vips_convf_init( VipsConvf *convf )
+{
+        convf->nnz = 0;
+        convf->coeff = NULL;
+        convf->coeff_pos = NULL;
+}
+
+/**
+ * vips_convf:
+ * @in: input image
+ * @out: output image
+ * @mask: convolve with this mask
+ * @...: %NULL-terminated list of optional named arguments
+ *
+ * Convolution. This is a low-level operation, see vips_conv() for something
+ * more convenient. 
+ *
+ * Perform a convolution of @in with @mask.
+ * Each output pixel is
+ * calculated as sigma[i]{pixel[i] * mask[i]} / scale + offset, where scale
+ * and offset are part of @mask. 
+ *
+ * The convolution is performed with floating-point arithmetic. The output image 
+ * is always #VIPS_FORMAT_FLOAT unless @in is #VIPS_FORMAT_DOUBLE, in which case
+ * @out is also #VIPS_FORMAT_DOUBLE. 
+ *
+ * See also: vips_conv().
+ *
+ * Returns: 0 on success, -1 on error
+ */
+int 
+vips_convf( VipsImage *in, VipsImage **out, VipsImage *mask, ... )
+{
+	va_list ap;
+	int result;
+
+	va_start( ap, mask );
+	result = vips_call_split( "convf", ap, in, out, mask );
+	va_end( ap );
+
+	return( result );
+}
+
diff --git a/libvips/convolution/convi.c b/libvips/convolution/convi.c
new file mode 100644
index 00000000..85817b5e
--- /dev/null
+++ b/libvips/convolution/convi.c
@@ -0,0 +1,1066 @@
+/* convi
+ *
+ * Copyright: 1990, N. Dessipris.
+ *
+ * Author: Nicos Dessipris & Kirk Martinez
+ * Written on: 29/04/1991
+ * Modified on: 19/05/1991
+ * 8/7/93 JC
+ *      - adapted for partial v2
+ *      - memory leaks fixed
+ *      - ANSIfied
+ * 23/7/93 JC
+ *	- inner loop unrolled with a switch - 25% speed-up!
+ * 13/12/93 JC
+ *	- tiny rounding error removed
+ * 7/10/94 JC
+ *	- new IM_ARRAY() macro
+ *	- various simplifications
+ *	- evalend callback added
+ * 1/2/95 JC
+ *	- use of IM_REGION_ADDR() updated
+ *	- output size was incorrect! see comment below
+ *	- bug with large non-square matricies fixed too
+ *	- uses new im_embed() function
+ * 13/7/98 JC
+ *	- wierd bug ... im_free_imask is no longer directly called for close
+ *	  callback, caused SIGKILL on solaris 2.6 ... linker bug?
+ * 9/3/01 JC
+ *	- reworked and simplified, about 10% faster
+ *	- slightly better range clipping
+ * 27/7/01 JC
+ *	- reject masks with scale == 0
+ * 7/4/04 
+ *	- im_conv() now uses im_embed() with edge stretching on the input, not
+ *	  the output
+ *	- sets Xoffset / Yoffset
+ * 11/11/05
+ * 	- simpler inner loop avoids gcc4 bug 
+ * 7/11/07
+ * 	- new evalstart/end callbacks
+ * 12/5/08
+ * 	- int rounding was +1 too much, argh
+ * 	- only rebuild the buffer offsets if bpl changes
+ * 5/4/09
+ * 	- tiny speedups and cleanups
+ * 	- add restrict, though it doesn't seem to help gcc
+ * 12/11/09
+ * 	- only check for non-zero elements once
+ * 	- add mask-all-zero check
+ * 	- cleanups
+ * 3/2/10
+ * 	- gtkdoc
+ * 	- more cleanups
+ * 23/08/10
+ * 	- add a special case for 3x3 masks, about 20% faster
+ * 1/10/10
+ * 	- support complex (just double the bands)
+ * 18/10/10
+ * 	- add experimental Orc path
+ * 29/10/10
+ * 	- use VipsVector
+ * 	- get rid of im_convsep(), just call this twice, no longer worth
+ * 	  keeping two versions
+ * 8/11/10
+ * 	- add array tiling
+ * 9/5/11
+ * 	- argh typo in overflow estimation could cause errors
+ * 15/10/11 Nicolas
+ * 	- handle offset correctly in seperable convolutions
+ * 26/1/16 Lovell Fuller
+ * 	- remove Duff for a 25% speedup
+ * 23/6/16
+ * 	- rewritten as a class
+ * 	- new fixed-point vector path, up to 2x faster
+ */
+
+/*
+
+    This file is part of VIPS.
+    
+    VIPS is free software; you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+    02110-1301  USA
+
+ */
+
+/*
+
+    These files are distributed with VIPS - http://www.vips.ecs.soton.ac.uk
+
+ */
+
+/* 
+#define DEBUG
+#define DEBUG_PIXELS
+#define DEBUG_COMPILE
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif /*HAVE_CONFIG_H*/
+#include <vips/intl.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <limits.h>
+
+#include <vips/vips.h>
+
+#include "pconvolution.h"
+
+/* We do the 8-bit vector path with fixed-point arithmetic. We use 2.6 bits
+ * for the mask coefficients, so our range is -2 to +1.99, after using scale
+ * on the mask.
+ */
+#define FIXED_BITS (6)
+#define FIXED_SCALE (1 << FIXED_BITS)
+
+/* Larger than this and we fall back to C.
+ */
+#define MAX_PASS (20)
+
+/* A pass with a vector. 
+ */
+typedef struct {
+	int first;		/* The index of the first mask coff we use */
+	int last;		/* The index of the last mask coff we use */
+
+	int r;			/* Set previous result in this var */
+
+        /* The code we generate for this section of the mask. 
+	 */
+        VipsVector *vector;
+} Pass;
+
+typedef struct {
+	VipsConvolution parent_instance;
+
+	/* An int version of M.
+	 */
+	VipsImage *iM;
+
+	/* We make a smaller version of the mask with the zeros squeezed out.
+	 */
+	int nnz;		/* Number of non-zero mask elements */
+	int *coeff;		/* Array of non-zero mask coefficients */
+	int *coeff_pos;		/* Index of each nnz element in mask->coeff */
+
+	/* And a fixed-point version for a vector path.
+	 */
+	int *fixed;
+	int n_point;		/* Number of points in fixed-point array */
+
+	/* The set of passes we need for this mask.
+	 */
+	int n_pass;	
+	Pass pass[MAX_PASS];
+
+	/* Code for the final clip back to 8 bits.
+	 */
+	int r;			
+        VipsVector *vector;
+} VipsConvi;
+
+typedef VipsConvolutionClass VipsConviClass;
+
+G_DEFINE_TYPE( VipsConvi, vips_convi, VIPS_TYPE_CONVOLUTION );
+
+/* Our sequence value.
+ */
+typedef struct {
+	VipsConvi *convi;
+	VipsRegion *ir;		/* Input region */
+
+	int *offsets;		/* Offsets for each non-zero matrix element */
+	VipsPel **pts;		/* Per-non-zero mask element image pointers */
+
+	int last_bpl;		/* Avoid recalcing offsets, if we can */
+
+	/* We need a pair of intermediate buffers to keep the results of each
+	 * vector conv pass. 
+	 */
+	short *t1;
+	short *t2;
+} VipsConviSequence;
+
+/* Free a sequence value.
+ */
+static int
+vips_convi_stop( void *vseq, void *a, void *b )
+{
+	VipsConviSequence *seq = (VipsConviSequence *) vseq;
+
+	VIPS_UNREF( seq->ir );
+	VIPS_FREE( seq->offsets );
+	VIPS_FREE( seq->pts );
+	VIPS_FREE( seq->t1 );
+	VIPS_FREE( seq->t2 );
+
+	return( 0 );
+}
+
+/* Convolution start function.
+ */
+static void *
+vips_convi_start( VipsImage *out, void *a, void *b )
+{
+	VipsImage *in = (VipsImage *) a;
+	VipsConvi *convi = (VipsConvi *) b;
+	VipsConviSequence *seq;
+
+	if( !(seq = VIPS_NEW( out, VipsConviSequence )) )
+		return( NULL );
+
+	seq->convi = convi;
+	seq->ir = NULL;
+	seq->offsets = NULL;
+	seq->pts = NULL;
+	seq->last_bpl = -1;
+	seq->t1 = NULL;
+	seq->t2 = NULL;
+
+	seq->ir = vips_region_new( in );
+
+	/* C mode.
+	 */
+	if( convi->nnz ) {
+		seq->offsets = VIPS_ARRAY( NULL, convi->nnz, int );
+		seq->pts = VIPS_ARRAY( NULL, convi->nnz, VipsPel * );
+
+		if( !seq->offsets ||
+			!seq->pts ) { 
+			vips_convi_stop( seq, in, convi );
+			return( NULL );
+		}
+	}
+
+	/* Vector mode.
+	 */
+	if( convi->n_pass ) {
+		seq->t1 = VIPS_ARRAY( NULL, VIPS_IMAGE_N_ELEMENTS( in ), short );
+		seq->t2 = VIPS_ARRAY( NULL, VIPS_IMAGE_N_ELEMENTS( in ), short );
+
+		if( !seq->t1 || 
+			!seq->t2 ) {
+			vips_convi_stop( seq, in, convi );
+			return( NULL );
+		}
+	}
+
+	return( (void *) seq );
+}
+
+static void
+vips_convi_compile_free( VipsConvi *convi )
+{
+	int i;
+
+	for( i = 0; i < convi->n_pass; i++ )
+		VIPS_FREEF( vips_vector_free, convi->pass[i].vector );
+	convi->n_pass = 0;
+	VIPS_FREEF( vips_vector_free, convi->vector );
+}
+
+#define TEMP( N, S ) vips_vector_temporary( v, (char *) N, S )
+#define PARAM( N, S ) vips_vector_parameter( v, (char *) N, S )
+#define SCANLINE( N, P, S ) vips_vector_source_scanline( v, (char *) N, P, S )
+#define CONST( N, V, S ) vips_vector_constant( v, (char *) N, V, S )
+#define ASM2( OP, A, B ) vips_vector_asm2( v, (char *) OP, A, B )
+#define ASM3( OP, A, B, C ) vips_vector_asm3( v, (char *) OP, A, B, C )
+
+/* Generate code for a section of the mask. first is the index we start
+ * at, we set last to the index of the last one we use before we run 
+ * out of intermediates / constants / parameters / sources or mask
+ * coefficients.
+ *
+ * 0 for success, -1 on error.
+ */
+static int
+vips_convi_compile_section( VipsConvi *convi, VipsImage *in, Pass *pass )
+{
+	VipsConvolution *convolution = (VipsConvolution *) convi;
+	VipsImage *M = convolution->M;
+
+	VipsVector *v;
+	int i;
+
+#ifdef DEBUG_COMPILE
+	printf( "starting pass %d\n", pass->first ); 
+#endif /*DEBUG_COMPILE*/
+
+	pass->vector = v = vips_vector_new( "convi", 2 );
+
+	/* "r" is the array of sums from the previous pass (if any).
+	 */
+	pass->r = vips_vector_source_name( v, "r", 2 );
+
+	/* The value we fetch from the image, the accumulated sum.
+	 */
+	TEMP( "value", 2 );
+	TEMP( "valueb", 1 );
+	TEMP( "sum", 2 );
+
+	/* Init the sum. If this is the first pass, it's a constant. If this
+	 * is a later pass, we have to init the sum from the result 
+	 * of the previous pass. 
+	 */
+	if( pass->first == 0 ) {
+		char c0[256];
+
+		CONST( c0, 0, 2 );
+		ASM2( "loadpw", "sum", c0 );
+	}
+	else 
+		ASM2( "loadw", "sum", "r" );
+
+	for( i = pass->first; i < convi->n_point; i++ ) {
+		int x = i % M->Xsize;
+		int y = i / M->Xsize;
+
+		char source[256];
+		char off[256];
+		char coeff[256];
+
+		/* Exclude zero elements.
+		 */
+		if( !convi->fixed[i] )
+			continue;
+
+		/* The source. sl0 is the first scanline in the mask.
+		 */
+		SCANLINE( source, y, 1 );
+
+		/* Load with an offset. Only for non-first-columns though.
+		 */
+		if( x == 0 ) 
+			ASM2( "convubw", "value", source );
+		else {
+			CONST( off, in->Bands * x, 1 );
+			ASM3( "loadoffb", "valueb", source, off );
+			ASM2( "convubw", "value", "valueb" );
+		}
+
+		/* Mask coefficients are 2.6 bits fixed point, so -2 to +1.99.
+		 *
+		 * We need a signed multiply, so the image pixel needs to
+		 * become a signed 16-bit value. We know only the bottom 8 bits
+		 * of the image and coefficient are interesting, so we can take
+		 * the bottom half of a 16x16->32 multiply. 
+		 */
+		CONST( coeff, convi->fixed[i], 2 );
+		ASM3( "mullw", "value", "value", coeff );
+
+		/* We accumulate the signed 16-bit result in sum. Saturated
+		 * add. 
+		 */
+		ASM3( "addssw", "sum", "sum", "value" );
+
+		if( vips_vector_full( v ) )
+			break;
+	}
+
+	pass->last = i;
+
+	/* And write to our intermediate buffer.
+	 */
+	ASM2( "copyw", "d1", "sum" );
+
+	if( !vips_vector_compile( v ) ) 
+		return( -1 );
+
+#ifdef DEBUG_COMPILE
+	printf( "done coeffs %d to %d\n", pass->first, pass->last );
+	vips_vector_print( v );
+#endif /*DEBUG_COMPILE*/
+
+	return( 0 );
+}
+
+/* Generate code for the final 16->8 conversion. 
+ *
+ * 0 for success, -1 on error.
+ */
+static int
+vips_convi_compile_clip( VipsConvi *convi )
+{
+	VipsConvolution *convolution = (VipsConvolution *) convi;
+	VipsImage *M = convolution->M;
+	int offset = VIPS_RINT( vips_image_get_offset( M ) );
+
+	VipsVector *v;
+	char c32[256];
+	char c6[256];
+	char c0[256];
+	char c255[256];
+	char off[256];
+
+	convi->vector = v = vips_vector_new( "convi", 1 );
+
+	/* "r" is the array of sums we clip down. 
+	 */
+	convi->r = vips_vector_source_name( v, "r", 2 );
+
+	/* The value we fetch from the image.
+	 */
+	TEMP( "value", 2 );
+
+	CONST( c32, 32, 2 );
+	ASM3( "addw", "value", "r", c32 );
+	CONST( c6, 6, 2 );
+	ASM3( "shrsw", "value", "value", c6 );
+
+	CONST( off, offset, 2 ); 
+	ASM3( "addw", "value", "value", off );
+
+	/* You'd think "convsuswb" (convert signed 16-bit to unsigned
+	 * 8-bit with saturation) would be quicker, but it's a lot
+	 * slower.
+	 */
+	CONST( c0, 0, 2 );
+	ASM3( "maxsw", "value", c0, "value" ); 
+	CONST( c255, 255, 2 );
+	ASM3( "minsw", "value", c255, "value" ); 
+
+	ASM2( "convwb", "d1", "value" );
+
+	if( !vips_vector_compile( v ) ) 
+		return( -1 );
+
+	return( 0 );
+}
+
+static int
+vips_convi_compile( VipsConvi *convi, VipsImage *in )
+{
+	int i;
+	Pass *pass;
+
+	/* Generate passes until we've used up the whole mask.
+	 */
+	for( i = 0;; ) {
+		/* Allocate space for another pass.
+		 */
+		if( convi->n_pass == MAX_PASS ) 
+			return( -1 );
+		pass = &convi->pass[convi->n_pass];
+		convi->n_pass += 1;
+
+		pass->first = i;
+		pass->r = -1;
+
+		if( vips_convi_compile_section( convi, in, pass ) )
+			return( -1 );
+		i = pass->last + 1;
+
+		if( i >= convi->n_point )
+			break;
+	}
+
+	if( vips_convi_compile_clip( convi ) )
+		return( -1 );
+
+	return( 0 );
+}
+
+static int
+vips_convi_generate_vector( VipsRegion *or, 
+	void *vseq, void *a, void *b, gboolean *stop )
+{
+	VipsConviSequence *seq = (VipsConviSequence *) vseq;
+	VipsConvi *convi = (VipsConvi *) b;
+	VipsConvolution *convolution = (VipsConvolution *) convi;
+	VipsImage *M = convolution->M;
+	VipsImage *in = (VipsImage *) a;
+	VipsRegion *ir = seq->ir;
+	VipsRect *r = &or->valid;
+	int ne = r->width * in->Bands;
+
+	VipsRect s;
+	int i, y;
+	VipsExecutor executor[MAX_PASS];
+	VipsExecutor clip;
+
+#ifdef DEBUG_PIXELS
+	printf( "vips_convi_generate_vector: generating %d x %d at %d x %d\n",
+		r->width, r->height, r->left, r->top ); 
+#endif /*DEBUG_PIXELS*/
+
+	/* Prepare the section of the input image we need. A little larger
+	 * than the section of the output image we are producing.
+	 */
+	s = *r;
+	s.width += M->Xsize - 1;
+	s.height += M->Ysize - 1;
+	if( vips_region_prepare( ir, &s ) )
+		return( -1 );
+
+	for( i = 0; i < convi->n_pass; i++ ) 
+		vips_executor_set_program( &executor[i], 
+			convi->pass[i].vector, ne );
+	vips_executor_set_program( &clip, convi->vector, ne );
+
+	VIPS_GATE_START( "vips_convi_generate_vector: work" ); 
+
+	for( y = 0; y < r->height; y ++ ) { 
+		VipsPel *q = VIPS_REGION_ADDR( or, r->left, r->top + y );
+	
+#ifdef DEBUG_PIXELS
+{
+		int h, v;
+
+		printf( "before convolve: x = %d, y = %d\n", 
+			r->left, r->top + y );
+		for( v = 0; v < M->Ysize; v++ ) {
+			for( h = 0; h < M->Xsize; h++ )
+				printf( "%3d ", *VIPS_REGION_ADDR( ir, 
+					r->left + h, r->top + y + v ) );
+			printf( "\n" );
+		}
+}
+#endif /*DEBUG_PIXELS*/
+
+		/* We run our n passes to generate this scanline.
+		 */
+		for( i = 0; i < convi->n_pass; i++ ) {
+			Pass *pass = &convi->pass[i]; 
+
+			vips_executor_set_scanline( &executor[i], 
+				ir, r->left, r->top + y );
+			vips_executor_set_array( &executor[i],
+				pass->r, seq->t1 );
+			vips_executor_set_destination( &executor[i], seq->t2 );
+			vips_executor_run( &executor[i] );
+
+			VIPS_SWAP( signed short *, seq->t1, seq->t2 );
+		}
+
+#ifdef DEBUG_PIXELS
+		printf( "before clip: %d\n", ((signed short *) seq->t1)[0] );
+#endif /*DEBUG_PIXELS*/
+
+		vips_executor_set_array( &clip, convi->r, seq->t1 );
+		vips_executor_set_destination( &clip, q ); 
+		vips_executor_run( &clip );
+
+#ifdef DEBUG_PIXELS
+		printf( "after clip: %d\n", 
+			*VIPS_REGION_ADDR( or, r->left, r->top + y ) );
+#endif /*DEBUG_PIXELS*/
+	}
+
+	VIPS_GATE_STOP( "vips_convi_generate_vector: work" ); 
+
+	VIPS_COUNT_PIXELS( or, "vips_convi_generate_vector" ); 
+
+	return( 0 );
+}
+
+/* INT inner loops.
+ */
+#define CONV_INT( TYPE, CLIP ) { \
+	TYPE ** restrict p = (TYPE **) seq->pts; \
+	TYPE * restrict q = (TYPE *) VIPS_REGION_ADDR( or, le, y ); \
+	\
+	for( x = 0; x < sz; x++ ) {  \
+		int sum; \
+		int i; \
+		\
+		sum = 0; \
+		for ( i = 0; i < nnz; i++ ) \
+			sum += t[i] * p[i][x]; \
+		\
+		sum = ((sum + rounding) / scale) + offset; \
+		\
+		CLIP; \
+		\
+		q[x] = sum;  \
+	}  \
+} 
+
+/* FLOAT inner loops.
+ */
+#define CONV_FLOAT( TYPE ) { \
+	TYPE ** restrict p = (TYPE **) seq->pts; \
+	TYPE * restrict q = (TYPE *) VIPS_REGION_ADDR( or, le, y ); \
+	\
+	for( x = 0; x < sz; x++ ) {  \
+		double sum; \
+		int i; \
+		\
+		sum = 0; \
+		for ( i = 0; i < nnz; i++ ) \
+			sum += t[i] * p[i][x]; \
+ 		\
+		sum = (sum / scale) + offset; \
+		\
+		q[x] = sum;  \
+	}  \
+} 
+
+/* Various integer range clips. Record over/under flows.
+ */
+#define CLIP_UCHAR( V ) \
+G_STMT_START { \
+	if( (V) < 0 ) \
+		(V) = 0; \
+	else if( (V) > UCHAR_MAX ) \
+		(V) = UCHAR_MAX; \
+} G_STMT_END
+
+#define CLIP_CHAR( V ) \
+G_STMT_START { \
+	if( (V) < SCHAR_MIN ) \
+		(V) = SCHAR_MIN; \
+	else if( (V) > SCHAR_MAX ) \
+		(V) = SCHAR_MAX; \
+} G_STMT_END
+
+#define CLIP_USHORT( V ) \
+G_STMT_START { \
+	if( (V) < 0 ) \
+		(V) = 0; \
+	else if( (V) > USHRT_MAX ) \
+		(V) = USHRT_MAX; \
+} G_STMT_END
+
+#define CLIP_SHORT( V ) \
+G_STMT_START { \
+	if( (V) < SHRT_MIN ) \
+		(V) = SHRT_MIN; \
+	else if( (V) > SHRT_MAX ) \
+		(V) = SHRT_MAX; \
+} G_STMT_END
+
+#define CLIP_NONE( V ) {}
+
+/* Convolve!
+ */
+static int
+vips_convi_generate( VipsRegion *or, 
+	void *vseq, void *a, void *b, gboolean *stop )
+{
+	VipsConviSequence *seq = (VipsConviSequence *) vseq;
+	VipsConvi *convi = (VipsConvi *) b;
+	VipsConvolution *convolution = (VipsConvolution *) convi;
+	VipsImage *M = convolution->M;
+	int scale = VIPS_RINT( vips_image_get_scale( M ) ); 
+	int rounding = scale / 2;
+	int offset = VIPS_RINT( vips_image_get_offset( M ) ); 
+	VipsImage *in = (VipsImage *) a;
+	VipsRegion *ir = seq->ir;
+	int * restrict t = convi->coeff; 
+	const int nnz = convi->nnz;
+	VipsRect *r = &or->valid;
+	int le = r->left;
+	int to = r->top;
+	int bo = VIPS_RECT_BOTTOM( r );
+	int sz = VIPS_REGION_N_ELEMENTS( or ) * 
+		(vips_band_format_iscomplex( in->BandFmt ) ? 2 : 1);
+
+	VipsRect s;
+	int x, y, z, i;
+
+	/* Prepare the section of the input image we need. A little larger
+	 * than the section of the output image we are producing.
+	 */
+	s = *r;
+	s.width += M->Xsize - 1;
+	s.height += M->Ysize - 1;
+	if( vips_region_prepare( ir, &s ) )
+		return( -1 );
+
+        /* Fill offset array. Only do this if the bpl has changed since the 
+	 * previous vips_region_prepare().
+	 */
+	if( seq->last_bpl != VIPS_REGION_LSKIP( ir ) ) {
+		seq->last_bpl = VIPS_REGION_LSKIP( ir );
+
+		for( i = 0; i < nnz; i++ ) {
+			z = convi->coeff_pos[i];
+			x = z % M->Xsize;
+			y = z / M->Xsize;
+
+			seq->offsets[i] = 
+				VIPS_REGION_ADDR( ir, x + le, y + to ) -
+				VIPS_REGION_ADDR( ir, le, to );
+		}
+	}
+
+	for( y = to; y < bo; y++ ) { 
+		/* Init pts for this line of PELs.
+		 */
+		for( z = 0; z < nnz; z++ )
+			seq->pts[z] = seq->offsets[z] +
+				VIPS_REGION_ADDR( ir, le, y ); 
+
+		switch( in->BandFmt ) {
+		case VIPS_FORMAT_UCHAR: 	
+			CONV_INT( unsigned char, CLIP_UCHAR( sum ) ); 
+			break;
+
+		case VIPS_FORMAT_CHAR:   
+			CONV_INT( signed char, CLIP_CHAR( sum ) ); 
+			break;
+
+		case VIPS_FORMAT_USHORT: 
+			CONV_INT( unsigned short, CLIP_USHORT( sum ) ); 
+			break;
+
+		case VIPS_FORMAT_SHORT:  
+			CONV_INT( signed short, CLIP_SHORT( sum ) ); 
+			break;
+
+		case VIPS_FORMAT_UINT:   
+			CONV_INT( unsigned int, CLIP_NONE( sum ) ); 
+			break;
+
+		case VIPS_FORMAT_INT:    
+			CONV_INT( signed int, CLIP_NONE( sum ) ); 
+			break;
+
+		case VIPS_FORMAT_FLOAT:  
+		case VIPS_FORMAT_COMPLEX:  
+			CONV_FLOAT( float ); 
+			break;
+
+		case VIPS_FORMAT_DOUBLE: 
+		case VIPS_FORMAT_DPCOMPLEX:  
+			CONV_FLOAT( double ); 
+			break;
+
+		default:
+			g_assert_not_reached();
+		}
+	}
+
+	return( 0 );
+}
+
+/* Make an int version of a mask.
+ *
+ * We rint() everything, then adjust the scale try to match the overall
+ * effect.
+ */
+int
+vips__image_intize( VipsImage *in, VipsImage **out )
+{
+	VipsImage *t;
+	int x, y;
+	double double_result;
+	double out_scale;
+	double out_offset;
+	int int_result;
+
+	if( vips_check_matrix( "vips2imask", in, &t ) )
+		return( -1 ); 
+	if( !(*out = vips_image_new_matrix( t->Xsize, t->Ysize )) ) {
+		g_object_unref( t ); 
+		return( -1 ); 
+	}
+
+	/* We want to make an intmask which has the same input to output ratio
+	 * as the double image.
+	 *
+	 * Imagine convolving with the double image, what's the ratio of
+	 * brightness between input and output? We want the same ratio for the
+	 * int version, if we can.
+	 *
+	 * Imagine an input image where every pixel is 1, what will the output
+	 * be?
+	 */
+	double_result = 0;
+	for( y = 0; y < t->Ysize; y++ )
+		for( x = 0; x < t->Xsize; x++ )
+			double_result += *VIPS_MATRIX( t, x, y ); 
+	double_result /= vips_image_get_scale( t );
+
+	for( y = 0; y < t->Ysize; y++ )
+		for( x = 0; x < t->Xsize; x++ )
+			*VIPS_MATRIX( *out, x, y ) = 
+				VIPS_RINT( *VIPS_MATRIX( t, x, y ) );
+
+	out_scale = VIPS_RINT( vips_image_get_scale( t ) );
+	if( out_scale == 0 )
+		out_scale = 1;
+	out_offset = VIPS_RINT( vips_image_get_offset( t ) );
+
+	/* Now convolve a 1 everywhere image with the int version we've made,
+	 * what do we get?
+	 */
+	int_result = 0;
+	for( y = 0; y < t->Ysize; y++ )
+		for( x = 0; x < t->Xsize; x++ )
+			int_result += *VIPS_MATRIX( *out, x, y ); 
+	int_result /= out_scale;
+
+	/* And adjust the scale to get as close to a match as we can. 
+	 */
+	out_scale = VIPS_RINT( out_scale + (int_result - double_result) );
+	if( out_scale == 0 ) 
+		out_scale = 1;
+
+	vips_image_set_double( *out, "scale", out_scale );
+	vips_image_set_double( *out, "offset", out_offset );
+
+	g_object_unref( t ); 
+
+	return( 0 );
+}
+
+/* Make an int version of a mask.
+ *
+ * This makes a fixed-point version ready for the vector path: instead of an 
+ * output scale, we have x.y for each element.
+ *
+ * @out is a w x h int array.
+ */
+static int
+intize_to_fixed_point( VipsImage *in, int *out )
+{
+	VipsImage *t;
+	double scale;
+	int ne;
+	double *scaled;
+	double total_error;
+	int i;
+
+	if( vips_check_matrix( "vips2imask", in, &t ) )
+		return( -1 ); 
+
+	/* Bake the scale into the mask.
+	 */
+	scale = vips_image_get_scale( t );
+	ne = t->Xsize * t->Ysize; 
+        if( !(scaled = VIPS_ARRAY( in, ne, double )) ) {
+		g_object_unref( t ); 
+		return( -1 );
+	}
+	for( i = 0; i < ne; i++ ) 
+		scaled[i] = VIPS_MATRIX( t, 0, 0 )[i] / scale;
+	g_object_unref( t ); 
+
+	/* The scaled mask must fit in 2.6 bits, so we can handle -2 to +1.99
+	 */
+	for( i = 0; i < ne; i++ ) 
+		if( scaled[i] >= 2.0 ||
+			scaled[i] < -2 ) {
+#ifdef DEBUG_COMPILE
+			printf( "intize_to_fixed_point: out of range\n" );
+#endif /*DEBUG_COMPILE*/
+
+			return( -1 ); 
+		}
+
+	/* The smallest coefficient we can manage is 1/64th, we'll just turn
+	 * that into zero.
+	 *
+	 * Find the total error we'll get by rounding down to zero and bail if
+	 * it's significant.
+	 */
+	total_error = 0.0;
+	for( i = 0; i < ne; i++ ) 
+		if( scaled[i] != 0.0 &&
+			VIPS_FABS( scaled[i] ) < 1.0 / FIXED_SCALE ) 
+			total_error += VIPS_FABS( scaled[i] ); 
+
+	/* 0.1 is a 10% error.
+	 */
+	if( total_error > 0.1 ) {
+#ifdef DEBUG_COMPILE
+		printf( "intize_to_fixed_point: too many underflows\n" );
+#endif /*DEBUG_COMPILE*/
+
+		return( -1 ); 
+	}
+
+	vips_vector_to_fixed_point( scaled, out, ne, FIXED_SCALE );
+
+#ifdef DEBUG_COMPILE
+{
+	int x, y;
+
+	printf( "intize_to_fixed_point:\n" ); 
+	for( y = 0; y < t->Ysize; y++ ) {
+		printf( "\t" ); 
+		for( x = 0; x < t->Xsize; x++ ) 
+			printf( "%4d ", out[y * t->Xsize + x] ); 
+		printf( "\n" ); 
+	}
+}
+#endif /*DEBUG_COMPILE*/
+
+	return( 0 );
+}
+
+static int
+vips_convi_build( VipsObject *object )
+{
+	VipsObjectClass *class = VIPS_OBJECT_GET_CLASS( object );
+	VipsConvolution *convolution = (VipsConvolution *) object;
+	VipsConvi *convi = (VipsConvi *) object;
+	VipsImage **t = (VipsImage **) vips_object_local_array( object, 4 );
+
+	VipsImage *in;
+	VipsImage *M;
+	int n_point;
+	VipsGenerateFn generate;
+	double *coeff;
+        int i;
+
+	if( VIPS_OBJECT_CLASS( vips_convi_parent_class )->build( object ) )
+		return( -1 );
+
+	in = convolution->in;
+	M = convolution->M;
+	convi->n_point = n_point = M->Xsize * M->Ysize;
+
+	if( vips_embed( in, &t[0], 
+		M->Xsize / 2, M->Ysize / 2, 
+		in->Xsize + M->Xsize - 1, in->Ysize + M->Ysize - 1,
+		"extend", VIPS_EXTEND_COPY,
+		NULL ) )
+		return( -1 );
+	in = t[0]; 
+
+	/* For uchar input, try to make a vector path.
+	 */
+	if( vips_vector_isenabled() &&
+		in->BandFmt == VIPS_FORMAT_UCHAR ) {
+		if( (convi->fixed = VIPS_ARRAY( object, n_point, int )) &&
+			!intize_to_fixed_point( M, convi->fixed ) &&
+			!vips_convi_compile( convi, in ) ) {
+			generate = vips_convi_generate_vector;
+			vips_info( class->nickname, "using vector path" ); 
+		}
+		else
+			vips_convi_compile_free( convi );
+	}
+
+	/* If there's no vector path, fall back to C.
+	 */
+	if( !convi->n_pass ) {
+		/* Make an int version of our mask.
+		 */
+		if( vips__image_intize( M, &t[1] ) )
+			return( -1 ); 
+		convi->iM = M = t[1];
+
+		coeff = VIPS_MATRIX( M, 0, 0 ); 
+		if( !(convi->coeff = VIPS_ARRAY( object, n_point, int )) ||
+			!(convi->coeff_pos = VIPS_ARRAY( object, n_point, int )) )
+			return( -1 );
+
+		/* Squeeze out zero mask elements. 
+		 */
+		for( i = 0; i < n_point; i++ )
+			if( coeff[i] ) {
+				convi->coeff[convi->nnz] = coeff[i];
+				convi->coeff_pos[convi->nnz] = i;
+				convi->nnz += 1;
+			}
+
+		/* Was the whole mask zero? We must have at least 1 element 
+		 * in there: set it to zero.
+		 */
+		if( convi->nnz == 0 ) {
+			convi->coeff[0] = 0;
+			convi->coeff_pos[0] = 0;
+			convi->nnz = 1;
+		}
+
+		generate = vips_convi_generate;
+		vips_info( class->nickname, "using C path" ); 
+	}
+
+	g_object_set( convi, "out", vips_image_new(), NULL ); 
+	if( vips_image_pipelinev( convolution->out, 
+		VIPS_DEMAND_STYLE_SMALLTILE, in, NULL ) )
+		return( -1 );
+
+	/* Prepare output. Consider a 7x7 mask and a 7x7 image --- the output
+	 * would be 1x1.
+	 */
+	convolution->out->Xsize -= M->Xsize - 1;
+	convolution->out->Ysize -= M->Ysize - 1;
+
+	if( vips_image_generate( convolution->out, 
+		vips_convi_start, generate, vips_convi_stop, in, convi ) )
+		return( -1 );
+
+	convolution->out->Xoffset = -M->Xsize / 2;
+	convolution->out->Yoffset = -M->Ysize / 2;
+
+	return( 0 );
+}
+
+static void
+vips_convi_class_init( VipsConviClass *class )
+{
+	VipsObjectClass *object_class = (VipsObjectClass *) class;
+
+	object_class->nickname = "convi";
+	object_class->description = _( "int convolution operation" );
+	object_class->build = vips_convi_build;
+}
+
+static void
+vips_convi_init( VipsConvi *convi )
+{
+        convi->nnz = 0;
+        convi->coeff = NULL;
+        convi->coeff_pos = NULL;
+}
+
+/**
+ * vips_convi:
+ * @in: input image
+ * @out: output image
+ * @mask: convolve with this mask
+ * @...: %NULL-terminated list of optional named arguments
+ *
+ * Integer convolution. This is a low-level operation, see vips_conv() for 
+ * something more convenient. 
+ *
+ * @mask is converted to an integer mask with rint() of each element, rint of
+ * scale and rint of offset. Each output pixel is then calculated as 
+ *
+ * |[
+ * sigma[i]{pixel[i] * mask[i]} / scale + offset
+ * ]|
+ *
+ * The output image always has the same #VipsBandFormat as the input image. 
+ *
+ * For #VIPS_FORMAT_UCHAR images, vips_convi() uses a fast vector path based on
+ * fixed-point arithmetic. This can produce slightly different results. 
+ * Disable the vector path with `--vips-novector` or `VIPS_NOVECTOR` or
+ * vips_vector_set_enabled().
+ *
+ * See also: vips_conv().
+ *
+ * Returns: 0 on success, -1 on error
+ */
+int 
+vips_convi( VipsImage *in, VipsImage **out, VipsImage *mask, ... )
+{
+	va_list ap;
+	int result;
+
+	va_start( ap, mask );
+	result = vips_call_split( "convi", ap, in, out, mask );
+	va_end( ap );
+
+	return( result );
+}
+
diff --git a/libvips/convolution/convolution.c b/libvips/convolution/convolution.c
index a2caf0d9..2650a40c 100644
--- a/libvips/convolution/convolution.c
+++ b/libvips/convolution/convolution.c
@@ -157,16 +157,24 @@ void
 vips_convolution_operation_init( void )
 {
 	extern int vips_conv_get_type( void ); 
-	extern int vips_compass_get_type( void ); 
+	extern int vips_conva_get_type( void ); 
+	extern int vips_convf_get_type( void ); 
+	extern int vips_convi_get_type( void ); 
 	extern int vips_convsep_get_type( void ); 
+	extern int vips_convasep_get_type( void ); 
+	extern int vips_compass_get_type( void ); 
 	extern int vips_fastcor_get_type( void ); 
 	extern int vips_spcor_get_type( void ); 
 	extern int vips_sharpen_get_type( void ); 
 	extern int vips_gaussblur_get_type( void ); 
 
 	vips_conv_get_type(); 
+	vips_conva_get_type(); 
+	vips_convf_get_type(); 
+	vips_convi_get_type(); 
 	vips_compass_get_type(); 
 	vips_convsep_get_type(); 
+	vips_convasep_get_type(); 
 	vips_fastcor_get_type(); 
 	vips_spcor_get_type(); 
 	vips_sharpen_get_type(); 
diff --git a/libvips/convolution/convsep.c b/libvips/convolution/convsep.c
index 7838ec41..d428e734 100644
--- a/libvips/convolution/convsep.c
+++ b/libvips/convolution/convsep.c
@@ -63,6 +63,8 @@ vips_convsep_build( VipsObject *object )
 	VipsImage **t = (VipsImage **) 
 		vips_object_local_array( object, 3 );
 
+	VipsImage *in;
+
 	g_object_set( convsep, "out", vips_image_new(), NULL ); 
 
 	if( VIPS_OBJECT_CLASS( vips_convsep_parent_class )->build( object ) )
@@ -71,20 +73,38 @@ vips_convsep_build( VipsObject *object )
 	if( vips_check_separable( class->nickname, convolution->M ) ) 
                 return( -1 );
 
-	if( vips_rot( convolution->M, &t[0], VIPS_ANGLE_D90, NULL ) ||
-		vips_conv( convolution->in, &t[1], convolution->M, 
-			"precision", convsep->precision,
-			"layers", convsep->layers,
-			"cluster", convsep->cluster,
-			NULL ) ||
-		vips_conv( t[1], &t[2], t[0], 
-			"precision", convsep->precision,
-			"layers", convsep->layers,
-			"cluster", convsep->cluster,
-			NULL ) )
-		return( -1 ); 
+	in = convolution->in;
 
-	if( vips_image_write( t[2], convolution->out ) )
+	if( convsep->precision == VIPS_PRECISION_APPROXIMATE ) {
+		if( vips_convasep( convolution->in, &t[0], convolution->M,
+			"layers", convsep->layers,
+			NULL ) )
+			return( -1 ); 
+		in = t[0];
+	}
+	else { 
+		if( vips_rot( convolution->M, &t[0], VIPS_ANGLE_D90, NULL ) )
+			return( -1 ); 
+
+		/* We must only add the offset once.
+		 */
+		vips_image_set_double( t[0], "offset", 0 );
+
+		if( vips_conv( convolution->in, &t[1], convolution->M, 
+				"precision", convsep->precision,
+				"layers", convsep->layers,
+				"cluster", convsep->cluster,
+				NULL ) ||
+			vips_conv( t[1], &t[2], t[0], 
+				"precision", convsep->precision,
+				"layers", convsep->layers,
+				"cluster", convsep->cluster,
+				NULL ) )
+			return( -1 ); 
+		in = t[2];
+	}
+
+	if( vips_image_write( in, convolution->out ) )
 		return( -1 ); 
 
 	return( 0 );
diff --git a/libvips/convolution/im_aconv.c b/libvips/convolution/im_aconv.c
deleted file mode 100644
index 901f846a..00000000
--- a/libvips/convolution/im_aconv.c
+++ /dev/null
@@ -1,1273 +0,0 @@
-/* im_aconv ... approximate convolution
- *
- * This operation does an approximate convolution. 
- *
- * Author: John Cupitt & Nicolas Robidoux
- * Written on: 31/5/11
- * Modified on: 
- * 31/5/11
- *      - from im_aconvsep()
- */
-
-/*
-
-    This file is part of VIPS.
-    
-    VIPS is free software; you can redistribute it and/or modify
-    it under the terms of the GNU Lesser General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
-    02110-1301  USA
-
- */
-
-/*
-
-    These files are distributed with VIPS - http://www.vips.ecs.soton.ac.uk
-
- */
-
-/*
-
-  See:
-
-	http://incubator.quasimondo.com/processing/stackblur.pde
-
-  This thing is a little like stackblur, but generalised to any 2D 
-  convolution.
-
- */
-
-/*
-
-  TODO
-
-timing:
-
-$ time vips im_conv_f img_0075.jpg x2.v g2d201.con
-real	5m3.359s
-user	9m34.700s
-sys	0m1.500s
-
-$ time vips im_aconv img_0075.jpg x.v g2d201.con 10 10
-real	0m3.151s
-user	0m5.640s
-sys	0m0.100s
-
-$ vips im_subtract x.v x2.v diff.v
-$ vips im_abs diff.v abs.v
-$ vips im_max abs.v
-2.70833
-
-  	- are we handling mask offset correctly?
-
- */
-
-/*
-#define DEBUG
-#define VIPS_DEBUG
- */
-
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif /*HAVE_CONFIG_H*/
-#include <vips/intl.h>
-
-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
-#include <limits.h>
-#include <math.h>
-
-#include <vips/vips.h>
-#include <vips/vector.h>
-#include <vips/debug.h>
-
-/* Maximum number of boxes we can break the mask into.
- */
-#define MAX_LINES (10000)
-
-/* The number of edges we consider at once in clustering. Higher values are
- * faster, but risk pushing up average error in the result.
- */
-#define MAX_EDGES (1000)
-
-/* Get an (x,y) value from a mask.
- */
-#define MASK( M, X, Y ) ((M)->coeff[(X) + (Y) * (M)->xsize])
-
-/* A horizontal line in the mask.
- */
-typedef struct _HLine {
-	/* Start is the left-most pixel in the line, end is one beyond the
-	 * right-most pixel. 
-	 */
-	int start;
-	int end;
-
-	/* The hlines have weights. weight 0 means this line is unused.
-	 */
-	int weight;
-} HLine;
-
-/* For clustering. A pair of hlines and their distance. An edge in a graph.
- */
-typedef struct _Edge {
-	/* The index into boxes->hline[].
-	 */
-	int a;
-	int b;
-
-	/* The distance between them, see boxes_distance().
-	 */
-	int d;
-} Edge;
-
-/* An element of a vline.
- */
-typedef struct _VElement {
-	/* band is the index into hline[] we add, row is the row we take 
-	 * it from.
-	 */
-	int band;
-	int row;
-
-	/* Negative lobes are made with factor -1, we also common-up repeated
-	 * additions of the same line.
-	 */
-	int factor;
-} VElement;
-
-/* A vline.
- */
-typedef struct _VLine {
-	int band;
-	int factor;
-	int start;
-	int end;
-} VLine;
-
-/* A set of boxes. 
- */
-typedef struct _Boxes {
-	/* Copy of our arguments.
-	 */
-	IMAGE *in;
-	IMAGE *out;
-	DOUBLEMASK *mask;
-	int n_layers;
-	int cluster;
-
-	int area;
-	int rounding;
-
-	/* The horizontal lines we gather. hline[3] writes to band 3 in the
-	 * intermediate image. max_line is the length of the longest hline:
-	 * over 256 and we need to use an int intermediate for 8-bit images.
-	 */
-	int n_hline;
-	HLine hline[MAX_LINES];
-	int max_line;
-
-	/* During clustering, the top few edges we are considering.
-	 */
-	Edge edge[MAX_EDGES];
-
-	/* Scale and sum a set of hlines to make the final value. 
-	 */
-	int n_velement;
-	VElement velement[MAX_LINES];
-
-	/* And group those velements as vlines.
-	 */
-	int n_vline;
-	VLine vline[MAX_LINES];
-} Boxes;
-
-/* Euclid's algorithm. Use this to common up mults.
- */
-static int
-gcd( int a, int b )
-{
-	if( b == 0 )
-		return( abs( a ) );
-	else
-		return( gcd( b, a % b ) );
-}
-
-static void
-boxes_start( Boxes *boxes, int x )
-{
-	boxes->hline[boxes->n_hline].start = x;
-	boxes->hline[boxes->n_hline].weight = 1;
-}
-
-static int
-boxes_end( Boxes *boxes, int x, int y, int factor )
-{
-	boxes->hline[boxes->n_hline].end = x;
-
-	boxes->velement[boxes->n_velement].row = y;
-	boxes->velement[boxes->n_velement].band = boxes->n_hline;
-	boxes->velement[boxes->n_velement].factor = factor;
-
-	if( boxes->n_hline >= MAX_LINES - 1 ) {
-		vips_error( "im_aconv", "%s", _( "mask too complex" ) );
-		return( -1 );
-	}
-	boxes->n_hline += 1;
-
-	if( boxes->n_velement >= MAX_LINES - 1 ) {
-		vips_error( "im_aconv", "%s", _( "mask too complex" ) );
-		return( -1 );
-	}
-	boxes->n_velement += 1;
-
-	return( 0 );
-}
-
-#ifdef DEBUG
-static void
-boxes_hprint( Boxes *boxes )
-{
-	int x, y;
-
-	printf( "hlines:\n" );
-	printf( "   n   b   r  f   w\n" );
-	for( y = 0; y < boxes->n_velement; y++ ) {
-		int b = boxes->velement[y].band;
-
-		printf( "%4d %3d %3d %2d %3d ", 
-			y, b, 
-			boxes->velement[y].row, 
-			boxes->velement[y].factor,
-			boxes->hline[b].weight );
-		for( x = 0; x < 45; x++ ) {
-			int rx = x * (boxes->mask->xsize + 1) / 45;
-
-			if( rx >= boxes->hline[b].start && 
-				rx < boxes->hline[b].end )
-				printf( "#" );
-			else
-				printf( " " );
-		}
-		printf( " %3d .. %3d\n", 
-			boxes->hline[b].start, boxes->hline[b].end );
-	}
-}
-
-static void
-boxes_vprint( Boxes *boxes )
-{
-	int y;
-
-	printf( "%d vlines:\n", boxes->n_vline );
-	printf( "   n  b  f      s      e\n" );
-	for( y = 0; y < boxes->n_vline; y++ ) 
-		printf( "%4d %2d %2d == %3d .. %3d\n", y,
-			boxes->vline[y].band, 
-			boxes->vline[y].factor, 
-			boxes->vline[y].start, 
-			boxes->vline[y].end );
-
-	printf( "area = %d\n", boxes->area );
-	printf( "rounding = %d\n", boxes->rounding );
-	printf( "max_line = %d\n", boxes->max_line );
-}
-#endif /*DEBUG*/
-
-/* Break the mask into a set of lines.
- */
-static int
-boxes_break( Boxes *boxes )
-{
-	DOUBLEMASK *mask = boxes->mask;
-	const int size = mask->xsize * mask->ysize;
-
-	double max;
-	double min;
-	double depth;
-	int layers_above;
-	int layers_below;
-	int z, n, x, y;
-
-	/* Find mask range. We must always include the zero axis in the mask.
-	 */
-	max = 0;
-	min = 0;
-	for( n = 0; n < size; n++ ) {
-		max = IM_MAX( max, mask->coeff[n] );
-		min = IM_MIN( min, mask->coeff[n] );
-	}
-
-	VIPS_DEBUG_MSG( "boxes_new: min = %g, max = %g\n", min, max );
-
-	/* The zero axis must fall on a layer boundary. Estimate the
-	 * depth, find n-lines-above-zero, get exact depth, then calculate a
-	 * fixed n-lines which includes any negative parts.
-	 */
-	depth = (max - min) / boxes->n_layers;
-	layers_above = VIPS_CEIL( max / depth );
-	depth = max / layers_above;
-	layers_below = VIPS_FLOOR( min / depth );
-
-	boxes->n_layers = layers_above - layers_below;
-
-	VIPS_DEBUG_MSG( "boxes_new: depth = %g, n_layers = %d\n", 
-		depth, boxes->n_layers );
-
-	/* For each layer, generate a set of lines which are inside the
-	 * perimeter. Work down from the top.
-	 */
-	for( z = 0; z < boxes->n_layers; z++ ) {
-		/* How deep we are into the mask, as a double we can test
-		 * against. Add half the layer depth so we can easily find >50%
-		 * mask elements.
-		 */
-		double z_ph = max - (1 + z) * depth + depth / 2;
-
-		/* Odd, but we must avoid rounding errors that make us miss 0
-		 * in the line above.
-		 */
-		int z_positive = z < layers_above;
-
-		for( y = 0; y < mask->ysize; y++ ) {
-			int inside;
-
-			/* Start outside the perimeter.
-			 */
-			inside = 0;
-
-			for( x = 0; x < mask->xsize; x++ ) {
-				double coeff = MASK( mask, x, y );
-
-				/* The vertical line from mask[x, y] to 0 is 
-				 * inside. Is our current square (x, y) part 
-				 * of that line?
-				 */
-				if( (z_positive && coeff >= z_ph) ||
-					(!z_positive && coeff <= z_ph) ) {
-					if( !inside ) {
-						boxes_start( boxes, x );
-						inside = 1;
-					}
-				}
-				else {
-					if( inside ) {
-						if( boxes_end( boxes, x, y,
-							z_positive ? 1 : -1 ) )
-							return( -1 );
-						inside = 0;
-					}
-				}
-			}
-
-			if( inside && 
-				boxes_end( boxes, mask->xsize, y, 
-					z_positive ? 1 : -1 ) )
-				return( -1 );
-		}
-	}
-
-#ifdef DEBUG
-	VIPS_DEBUG_MSG( "boxes_new: generated %d boxes\n", boxes->n_hline );
-	boxes_hprint( boxes );
-#endif /*DEBUG*/
-
-	return( 0 );
-}
-
-/* The 'distance' between a pair of hlines.
- */
-static int
-boxes_distance( Boxes *boxes, int a, int b )
-{
-	g_assert( boxes->hline[a].weight > 0 && boxes->hline[b].weight > 0 );
-
-	return( abs( boxes->hline[a].start - boxes->hline[b].start ) + 
-		abs( boxes->hline[a].end - boxes->hline[b].end ) ); 
-}
-
-/* Merge two hlines. Line b is deleted, and any refs to b in vlines updated to
- * point at a.
- */
-static void
-boxes_merge( Boxes *boxes, int a, int b )
-{
-	int i;
-
-	/* Scale weights. 
-	 */
-	int fa = boxes->hline[a].weight;
-	int fb = boxes->hline[b].weight;
-	double w = (double) fb / (fa + fb);
-
-	/* New endpoints.
-	 */
-	boxes->hline[a].start += w * 
-		(boxes->hline[b].start - boxes->hline[a].start);
-	boxes->hline[a].end += w * 
-		(boxes->hline[b].end - boxes->hline[a].end);
-	boxes->hline[a].weight += boxes->hline[b].weight;
-
-	/* Update velement refs to b to refer to a instead.
-	 */
-	for( i = 0; i < boxes->n_velement; i++ )
-		if( boxes->velement[i].band == b )
-			boxes->velement[i].band = a;
-
-	/* Mark b to be deleted.
-	 */
-	boxes->hline[b].weight = 0;
-}
-
-static int
-edge_sortfn( const void *p1, const void *p2 )
-{
-	Edge *a = (Edge *) p1;
-	Edge *b = (Edge *) p2;
-
-	return( a->d - b->d );
-}
-
-/* Cluster in batches. Return non-zero if we merged some lines.
- *
- * This is not as accurate as rescanning the whole space on every merge, but
- * it's far faster.
- */
-static int
-boxes_cluster2( Boxes *boxes, int cluster )
-{
-	int i, j, k;
-	int worst;
-	int worst_i;
-	int merged;
-
-	for( i = 0; i < MAX_EDGES; i++ ) {
-		boxes->edge[i].a = -1;
-		boxes->edge[i].b = -1;
-		boxes->edge[i].d = 99999;
-	}
-	worst_i = 0;
-	worst = boxes->edge[worst_i].d;
-
-	for( i = 0; i < boxes->n_hline; i++ ) {
-		if( boxes->hline[i].weight == 0 )
-			continue;
-
-		for( j = i + 1; j < boxes->n_hline; j++ ) {
-			int distance;
-
-			if( boxes->hline[j].weight == 0 )
-				continue;
-
-			distance = boxes_distance( boxes, i, j ); 
-			if( distance < worst ) {
-				boxes->edge[worst_i].a = i;
-				boxes->edge[worst_i].b = j;
-				boxes->edge[worst_i].d = distance;
-
-				worst_i = 0;
-				worst = boxes->edge[worst_i].d;
-				for( k = 0; k < MAX_EDGES; k++ )
-					if( boxes->edge[k].d > worst ) {
-						worst = boxes->edge[k].d;
-						worst_i = k;
-					}
-			}
-		}
-	}
-
-	/* Sort to get closest first.
-	 */
-	qsort( boxes->edge, MAX_EDGES, sizeof( Edge ), edge_sortfn );
-
-	/*
-	printf( "edges:\n" );
-	printf( "  n   a   b  d:\n" );
-	for( i = 0; i < MAX_EDGES; i++ )
-		printf( "%2i) %3d %3d %3d\n", i, 
-			boxes->edge[i].a, boxes->edge[i].b, boxes->edge[i].d );
-	 */
-
-	/* Merge from the top down.
-	 */
-	merged = 0;
-	for( k = 0; k < MAX_EDGES; k++ ) {
-		Edge *edge = &boxes->edge[k];
-
-		if( edge->d > cluster )
-			break;
-
-		/* Has been removed, see loop below.
-		 */
-		if( edge->a == -1 )
-			continue;
-
-		boxes_merge( boxes, edge->a, edge->b );
-		merged = 1;
-
-		/* Nodes a and b have vanished or been moved. Remove any edges
-		 * which refer to them from the edge list,
-		 */
-		for( i = k; i < MAX_EDGES; i++ ) {
-			Edge *edgei = &boxes->edge[i];
-
-			if( edgei->a == edge->a ||
-				edgei->b == edge->a ||
-				edgei->a == edge->b ||
-				edgei->b == edge->b )
-				edgei->a = -1;
-		}
-	}
-
-	return( merged );
-}
-
-/* Renumber after clustering. We will have removed a lot of hlines ... shuffle
- * the rest down, adjust all the vline references.
- */
-static void
-boxes_renumber( Boxes *boxes )
-{
-	int i, j;
-
-	VIPS_DEBUG_MSG( "boxes_renumber: renumbering ...\n" );
-
-	/* Loop for all zero-weight hlines.
-	 */
-	for( i = 0; i < boxes->n_hline; ) {
-		if( boxes->hline[i].weight > 0 ) {
-			i++;
-			continue;
-		}
-
-		/* We move hlines i + 1 down, so we need to adjust all
-		 * band[] refs to match.
-		 */
-		for( j = 0; j < boxes->n_velement; j++ )
-			if( boxes->velement[j].band > i ) 
-				boxes->velement[j].band -= 1;
-
-		memmove( boxes->hline + i, boxes->hline + i + 1, 
-			sizeof( HLine ) * (boxes->n_hline - i - 1) );
-		boxes->n_hline -= 1;
-	}
-
-	VIPS_DEBUG_MSG( "boxes_renumber: ... %d hlines remain\n", 
-		boxes->n_hline );
-}
-
-/* Sort by band, then factor, then row.
- */
-static int
-velement_sortfn( const void *p1, const void *p2 )
-{
-	VElement *a = (VElement *) p1;
-	VElement *b = (VElement *) p2;
-
-	if( a->band != b->band )
-		return( a->band - b->band );
-
-	if( a->factor != b->factor )
-		return( a->factor - b->factor );
-
-	return( a->row - b->row );
-}
-
-static void
-boxes_vline( Boxes *boxes )
-{
-	int y, z;
-
-	VIPS_DEBUG_MSG( "boxes_vline: forming vlines ...\n" );
-
-	/* Sort to get elements which could form a vline together.
-	 */
-	qsort( boxes->velement, boxes->n_velement, sizeof( VElement ), 
-		velement_sortfn );
-
-#ifdef DEBUG
-	boxes_hprint( boxes );
-#endif /*DEBUG*/
-
-	/* If two lines have the same row and band, we can join them and knock
-	 * up the factor instead.
-	 */
-	for( y = 0; y < boxes->n_velement; y++ ) {
-		for( z = y + 1; z < boxes->n_velement; z++ )
-			if( boxes->velement[z].band != 
-				boxes->velement[y].band ||
-				boxes->velement[z].row != 
-					boxes->velement[y].row )
-				break;
-
-		boxes->velement[y].factor = z - y;
-		memmove( boxes->velement + y + 1, boxes->velement + z,
-			sizeof( VElement ) * (boxes->n_velement - z) );
-		boxes->n_velement -= z - y - 1;
-	}
-
-#ifdef DEBUG
-	printf( "after commoning up, %d velement remain\n", boxes->n_velement );
-	boxes_hprint( boxes );
-#endif /*DEBUG*/
-
-	boxes->n_vline = 0;
-	for( y = 0; y < boxes->n_velement; ) {
-		int n = boxes->n_vline;
-
-		/* Start of a line.
-		 */
-		boxes->vline[n].band = boxes->velement[y].band;
-		boxes->vline[n].factor = boxes->velement[y].factor;
-		boxes->vline[n].start = boxes->velement[y].row;
-
-		/* Search for the end of this line.
-		 */
-		for( z = y + 1; z < boxes->n_velement; z++ ) 
-			if( boxes->velement[z].band != 
-					boxes->vline[n].band ||
-				boxes->velement[z].factor != 
-					boxes->vline[n].factor ||
-				boxes->velement[z].row != 
-					boxes->vline[n].start + z - y )
-				break;
-
-		/* So the line ends at the previously examined element. We
-		 * want 'end' to be one beyond that (non-inclusive).
-		 */
-		boxes->vline[n].end = boxes->velement[z - 1].row + 1;
-
-		boxes->n_vline += 1;
-		y = z;
-	}
-
-	VIPS_DEBUG_MSG( "boxes_vline: found %d vlines\n", boxes->n_vline );
-}
-
-/* Break a mask into boxes.
- */
-static Boxes *
-boxes_new( IMAGE *in, IMAGE *out, DOUBLEMASK *mask, int n_layers, int cluster )
-{
-	const int size = mask->xsize * mask->ysize;
-
-	Boxes *boxes;
-	double sum;
-	int x, y, z;
-
-	/* Check parameters.
-	 */
-	if( im_piocheck( in, out ) ||
-		im_check_uncoded( "im_aconv", in ) ||
-		vips_check_dmask( "im_aconv", mask ) ) 
-		return( NULL );
-
-	boxes = VIPS_NEW( out, Boxes );
-	boxes->in = in;
-	boxes->out = out;
-	if( !(boxes->mask = (DOUBLEMASK *) im_local( out, 
-		(im_construct_fn) im_dup_dmask,
-		(im_callback_fn) im_free_dmask, mask, mask->filename, NULL )) )
-		return( NULL );
-	boxes->n_layers = n_layers;
-	boxes->cluster = cluster;
-
-	boxes->n_hline = 0;
-	boxes->n_velement = 0;
-	boxes->n_vline = 0;
-
-	/* Break into a set of hlines.
-	 */
-	if( boxes_break( boxes ) )
-		return( NULL );
-
-	/* Cluster to find groups of lines.
-	 */
-	VIPS_DEBUG_MSG( "boxes_new: clustering with thresh %d ...\n", cluster );
-	while( boxes_cluster2( boxes, cluster ) )
-		;
-
-	/* Renumber to remove holes created by clustering.
-	 */
-	boxes_renumber( boxes );
-
-	/* Find a set of vlines for the remaining hlines.
-	 */
-	boxes_vline( boxes );
-
-	/* Find the area of the lines and the length of the longest hline.
-	 */
-	boxes->area = 0;
-	boxes->max_line = 0;
-	for( y = 0; y < boxes->n_velement; y++ ) {
-		x = boxes->velement[y].band;
-		z = boxes->hline[x].end - boxes->hline[x].start;
-
-		boxes->area += boxes->velement[y].factor * z;
-		if( z > boxes->max_line )
-			boxes->max_line = z;
-	}
-
-	/* Strength reduction: if all lines are divisible by n, we can move
-	 * that n out into the ->area factor. The aim is to produce as many
-	 * factor 1 lines as we can and to reduce the chance of overflow.
-	 */
-	x = boxes->velement[0].factor;
-	for( y = 1; y < boxes->n_velement; y++ ) 
-		x = gcd( x, boxes->velement[y].factor );
-	for( y = 0; y < boxes->n_velement; y++ ) 
-		boxes->velement[y].factor /= x;
-	boxes->area *= x;
-
-	/* Find the area of the original mask.
-	 */
-	sum = 0;
-	for( z = 0; z < size; z++ ) 
-		sum += mask->coeff[z];
-
-	boxes->area = rint( sum * boxes->area / mask->scale );
-	boxes->rounding = (boxes->area + 1) / 2 + mask->offset * boxes->area;
-
-#ifdef DEBUG
-	boxes_hprint( boxes );
-	boxes_vprint( boxes );
-#endif /*DEBUG*/
-
-	/* With 512x512 tiles, each hline requires 3mb of intermediate per
-	 * thread ... 300 lines is about a gb per thread, ouch.
-	 */
-	if( boxes->n_hline > 150 ) {
-		im_error( "im_aconv", "%s", _( "mask too complex" ) );
-		return( NULL );
-	}
-
-	return( boxes );
-}
-
-/* Our sequence value.
- */
-typedef struct {
-	Boxes *boxes;
-
-	REGION *ir;		/* Input region */
-
-	/* Offsets for start and stop. 
-	 */
-	int *start;		
-	int *end;
-
-	int last_stride;	/* Avoid recalcing offsets, if we can */
-
-	/* The rolling sums. int for integer types, double for floating point 
-	 * types.
-	 */
-	void *sum;		
-} AConvSequence;
-
-/* Free a sequence value.
- */
-static int
-aconv_stop( void *vseq, void *a, void *b )
-{
-	AConvSequence *seq = (AConvSequence *) vseq;
-
-	IM_FREEF( im_region_free, seq->ir );
-
-	return( 0 );
-}
-
-/* Convolution start function.
- */
-static void *
-aconv_start( IMAGE *out, void *a, void *b )
-{
-	IMAGE *in = (IMAGE *) a;
-	Boxes *boxes = (Boxes *) b;
-
-	AConvSequence *seq;
-
-	if( !(seq = IM_NEW( out, AConvSequence )) )
-		return( NULL );
-
-	/* Init!
-	 */
-	seq->boxes = boxes;
-	seq->ir = im_region_create( in );
-
-	/* n_velement should be the largest possible dimension.
-	 */
-	g_assert( boxes->n_velement >= boxes->n_hline );
-	g_assert( boxes->n_velement >= boxes->n_vline );
-
-	seq->start = IM_ARRAY( out, boxes->n_velement, int );
-	seq->end = IM_ARRAY( out, boxes->n_velement, int );
-
-	if( vips_band_format_isint( out->BandFmt ) )
-		seq->sum = IM_ARRAY( out, boxes->n_velement, int );
-	else
-		seq->sum = IM_ARRAY( out, boxes->n_velement, double );
-	seq->last_stride = -1;
-
-	if( !seq->ir || !seq->start || !seq->end || !seq->sum ) {
-		aconv_stop( seq, in, boxes );
-		return( NULL );
-	}
-
-	return( seq );
-}
-
-/* The h and v loops are very similar, but also annoyingly different. Keep
- * them separate for easy debugging.
- */
-
-#define HCONV( IN, OUT ) \
-G_STMT_START { \
-	for( i = 0; i < bands; i++ ) { \
-		OUT *seq_sum = (OUT *) seq->sum; \
-		\
-		IN *p; \
-		OUT *q; \
-		\
-		p = i + (IN *) IM_REGION_ADDR( ir, r->left, r->top + y ); \
-		q = i * n_hline + \
-			(OUT *) IM_REGION_ADDR( or, r->left, r->top + y ); \
-		\
-		for( z = 0; z < n_hline; z++ ) { \
-			seq_sum[z] = 0; \
-			for( x = boxes->hline[z].start; \
-				x < boxes->hline[z].end; x++ ) \
-				seq_sum[z] += p[x * istride]; \
-			q[z] = seq_sum[z]; \
-		} \
-		q += ostride; \
-		\
-		for( x = 1; x < r->width; x++ ) {  \
-			for( z = 0; z < n_hline; z++ ) { \
-				seq_sum[z] += p[seq->end[z]]; \
-				seq_sum[z] -= p[seq->start[z]]; \
-				q[z] = seq_sum[z]; \
-			} \
-			p += istride; \
-			q += ostride; \
-		} \
-	} \
-} G_STMT_END
-
-/* Do horizontal masks ... we scan the mask along scanlines.
- */
-static int
-aconv_hgenerate( REGION *or, void *vseq, void *a, void *b )
-{
-	AConvSequence *seq = (AConvSequence *) vseq;
-	IMAGE *in = (IMAGE *) a;
-	Boxes *boxes = (Boxes *) b;
-
-	REGION *ir = seq->ir;
-	const int n_hline = boxes->n_hline;
-	DOUBLEMASK *mask = boxes->mask;
-	Rect *r = &or->valid;
-
-	/* Double the bands (notionally) for complex.
-	 */
-	int bands = vips_band_format_iscomplex( in->BandFmt ) ? 
-		2 * in->Bands : in->Bands;
-
-	Rect s;
-	int x, y, z, i;
-	int istride;
-	int ostride;
-
-	/* Prepare the section of the input image we need. A little larger
-	 * than the section of the output image we are producing.
-	 */
-	s = *r;
-	s.width += mask->xsize - 1;
-	if( im_prepare( ir, &s ) )
-		return( -1 );
-
-	istride = IM_IMAGE_SIZEOF_PEL( in ) / 
-		IM_IMAGE_SIZEOF_ELEMENT( in );
-	ostride = IM_IMAGE_SIZEOF_PEL( or->im ) / 
-		IM_IMAGE_SIZEOF_ELEMENT( or->im );
-
-        /* Init offset array. 
-	 */
-	if( seq->last_stride != istride ) {
-		seq->last_stride = istride;
-
-		for( z = 0; z < n_hline; z++ ) {
-			seq->start[z] = boxes->hline[z].start * istride;
-			seq->end[z] = boxes->hline[z].end * istride;
-		}
-	}
-
-	for( y = 0; y < r->height; y++ ) { 
-		switch( in->BandFmt ) {
-		case IM_BANDFMT_UCHAR: 	
-			if( boxes->max_line > 256 )
-				HCONV( unsigned char, unsigned int );
-			else
-				HCONV( unsigned char, unsigned short );
-			break;
-
-		case IM_BANDFMT_CHAR: 	
-			if( boxes->max_line > 256 )
-				HCONV( signed char, signed int );
-			else
-				HCONV( signed char, signed short );
-			break;
-
-		case IM_BANDFMT_USHORT: 	
-			HCONV( unsigned short, unsigned int );
-			break;
-
-		case IM_BANDFMT_SHORT: 	
-			HCONV( signed short, signed int );
-			break;
-
-		case IM_BANDFMT_UINT: 	
-			HCONV( unsigned int, unsigned int );
-			break;
-
-		case IM_BANDFMT_INT: 	
-			HCONV( signed int, signed int );
-			break;
-
-		case IM_BANDFMT_FLOAT: 	
-			HCONV( float, float );
-			break;
-
-		case IM_BANDFMT_DOUBLE: 	
-			HCONV( double, double );
-			break;
-
-		case IM_BANDFMT_COMPLEX: 	
-			HCONV( float, float );
-			break;
-
-		case IM_BANDFMT_DPCOMPLEX: 	
-			HCONV( double, double );
-			break;
-
-		default:
-			g_assert_not_reached();
-		}
-	}
-
-	return( 0 );
-}
-
-static int
-aconv_horizontal( Boxes *boxes, IMAGE *in, IMAGE *out )
-{
-	/* Prepare output. Consider a 7x7 mask and a 7x7 image --- the output
-	 * would be 1x1.
-	 */
-	if( im_cp_desc( out, in ) )
-		return( -1 );
-	out->Xsize -= boxes->mask->xsize - 1;
-	if( out->Xsize <= 0 ) { 
-		im_error( "im_aconv", "%s", _( "image too small for mask" ) );
-		return( -1 );
-	}
-	out->Bands *= boxes->n_hline;
-
-	/* Short u?char lines can use u?short intermediate.
-	 */
-	if( vips_band_format_isuint( in->BandFmt ) )
-		out->BandFmt = boxes->max_line < 256 ? 
-			IM_BANDFMT_USHORT : IM_BANDFMT_UINT;
-	else if( vips_band_format_isint( in->BandFmt ) )
-		out->BandFmt = boxes->max_line < 256 ? 
-			IM_BANDFMT_SHORT : IM_BANDFMT_INT;
-
-	if( im_demand_hint( out, IM_SMALLTILE, in, NULL ) ||
-		im_generate( out, 
-			aconv_start, aconv_hgenerate, aconv_stop, in, boxes ) )
-		return( -1 );
-
-	out->Xoffset = -boxes->mask->xsize / 2;
-	out->Yoffset = -boxes->mask->ysize / 2;
-
-	return( 0 );
-}
-
-#define CLIP_UCHAR( V ) \
-G_STMT_START { \
-	if( (V) < 0 ) \
-		(V) = 0; \
-	else if( (V) > UCHAR_MAX ) \
-		(V) = UCHAR_MAX; \
-} G_STMT_END
-
-#define CLIP_CHAR( V ) \
-G_STMT_START { \
-	if( (V) < SCHAR_MIN ) \
-		(V) = SCHAR_MIN; \
-	else if( (V) > SCHAR_MAX ) \
-		(V) = SCHAR_MAX; \
-} G_STMT_END
-
-#define CLIP_USHORT( V ) \
-G_STMT_START { \
-	if( (V) < 0 ) \
-		(V) = 0; \
-	else if( (V) > USHRT_MAX ) \
-		(V) = USHRT_MAX; \
-} G_STMT_END
-
-#define CLIP_SHORT( V ) \
-G_STMT_START { \
-	if( (V) < SHRT_MIN ) \
-		(V) = SHRT_MIN; \
-	else if( (V) > SHRT_MAX ) \
-		(V) = SHRT_MAX; \
-} G_STMT_END
-
-#define CLIP_NONE( V ) {}
-
-#define VCONV( ACC, IN, OUT, CLIP ) \
-G_STMT_START { \
-	for( x = 0; x < sz; x++ ) { \
-		ACC *seq_sum = (ACC *) seq->sum; \
-		\
-		IN *p; \
-		OUT *q; \
-		ACC sum; \
-		\
-		p = x * boxes->n_hline + \
-			(IN *) IM_REGION_ADDR( ir, r->left, r->top ); \
-		q = x + (OUT *) IM_REGION_ADDR( or, r->left, r->top ); \
-		\
-		sum = 0; \
-		for( z = 0; z < n_vline; z++ ) { \
-			seq_sum[z] = 0; \
-			for( k = boxes->vline[z].start; \
-				k < boxes->vline[z].end; k++ ) \
-				seq_sum[z] += p[k * istride + \
-					boxes->vline[z].band]; \
-			sum += boxes->vline[z].factor * seq_sum[z]; \
-		} \
-		sum = (sum + boxes->rounding) / boxes->area; \
-		CLIP( sum ); \
-		*q = sum; \
-		q += ostride; \
-		\
-		for( y = 1; y < r->height; y++ ) { \
-			sum = 0;\
-			for( z = 0; z < n_vline; z++ ) { \
-				seq_sum[z] += p[seq->end[z]]; \
-				seq_sum[z] -= p[seq->start[z]]; \
-				sum += boxes->vline[z].factor * seq_sum[z]; \
-			} \
-			p += istride; \
-			sum = (sum + boxes->rounding) / boxes->area; \
-			CLIP( sum ); \
-			*q = sum; \
-			q += ostride; \
-		} \
-	} \
-} G_STMT_END
-
-/* Do vertical masks ... we scan the mask down columns of pixels. 
- */
-static int
-aconv_vgenerate( REGION *or, void *vseq, void *a, void *b )
-{
-	AConvSequence *seq = (AConvSequence *) vseq;
-	IMAGE *in = (IMAGE *) a;
-	Boxes *boxes = (Boxes *) b;
-
-	REGION *ir = seq->ir;
-	const int n_vline = boxes->n_vline;
-	DOUBLEMASK *mask = boxes->mask;
-	Rect *r = &or->valid;
-
-	/* Double the width (notionally) for complex.
-	 */
-	int sz = vips_band_format_iscomplex( in->BandFmt ) ? 
-		2 * IM_REGION_N_ELEMENTS( or ) : IM_REGION_N_ELEMENTS( or );
-
-	Rect s;
-	int x, y, z, k;
-	int istride;
-	int ostride;
-
-	/* Prepare the section of the input image we need. A little larger
-	 * than the section of the output image we are producing.
-	 */
-	s = *r;
-	s.height += mask->ysize - 1;
-	if( im_prepare( ir, &s ) )
-		return( -1 );
-
-	istride = IM_REGION_LSKIP( ir ) / 
-		IM_IMAGE_SIZEOF_ELEMENT( in );
-	ostride = IM_REGION_LSKIP( or ) / 
-		IM_IMAGE_SIZEOF_ELEMENT( boxes->out );
-
-        /* Init offset array. 
-	 */
-	if( seq->last_stride != istride ) {
-		seq->last_stride = istride;
-
-		for( z = 0; z < n_vline; z++ ) {
-			seq->start[z] = boxes->vline[z].band + 
-				boxes->vline[z].start * istride;
-			seq->end[z] = boxes->vline[z].band + 
-				boxes->vline[z].end * istride;
-		}
-	}
-
-	switch( boxes->in->BandFmt ) {
-	case IM_BANDFMT_UCHAR: 	
-		if( boxes->max_line > 256 )
-			VCONV( unsigned int, \
-				unsigned int, unsigned char, CLIP_UCHAR );
-		else
-			VCONV( unsigned int, \
-				unsigned short, unsigned char, CLIP_UCHAR );
-		break;
-
-	case IM_BANDFMT_CHAR: 	
-		if( boxes->max_line > 256 )
-			VCONV( signed int, \
-				signed int, signed char, CLIP_UCHAR );
-		else
-			VCONV( signed int, \
-				signed short, signed char, CLIP_UCHAR );
-		break;
-
-	case IM_BANDFMT_USHORT: 	
-		VCONV( unsigned int, \
-			unsigned int, unsigned short, CLIP_USHORT );
-		break;
-
-	case IM_BANDFMT_SHORT: 	
-		VCONV( signed int, signed int, signed short, CLIP_SHORT );
-		break;
-
-	case IM_BANDFMT_UINT: 	
-		VCONV( unsigned int, unsigned int, unsigned int, CLIP_NONE );
-		break;
-
-	case IM_BANDFMT_INT: 	
-		VCONV( signed int, signed int, signed int, CLIP_NONE );
-		break;
-
-	case IM_BANDFMT_FLOAT: 	
-		VCONV( float, float, float, CLIP_NONE );
-		break;
-
-	case IM_BANDFMT_DOUBLE: 	
-		VCONV( double, double, double, CLIP_NONE );
-		break;
-
-	case IM_BANDFMT_COMPLEX: 	
-		VCONV( float, float, float, CLIP_NONE );
-		break;
-
-	case IM_BANDFMT_DPCOMPLEX: 	
-		VCONV( double, double, double, CLIP_NONE );
-		break;
-
-	default:
-		g_assert_not_reached();
-	}
-
-	return( 0 );
-}
-
-static int
-aconv_vertical( Boxes *boxes, IMAGE *in, IMAGE *out )
-{
-	/* Prepare output. Consider a 7x7 mask and a 7x7 image --- the output
-	 * would be 1x1.
-	 */
-	if( im_cp_desc( out, in ) )
-		return( -1 );
-	out->Ysize -= boxes->mask->ysize - 1;
-	if( out->Ysize <= 0 ) {
-		im_error( "im_aconv", "%s", _( "image too small for mask" ) );
-		return( -1 );
-	}
-	out->Bands = boxes->in->Bands;
-	out->BandFmt = boxes->in->BandFmt;
-
-	if( im_demand_hint( out, IM_SMALLTILE, in, NULL ) ||
-		im_generate( out, 
-			aconv_start, aconv_vgenerate, aconv_stop, in, boxes ) )
-		return( -1 );
-
-	out->Xoffset = -boxes->mask->xsize / 2;
-	out->Yoffset = -boxes->mask->ysize / 2;
-
-	return( 0 );
-}
-
-/**
- * im_aconv:
- * @in: input image
- * @out: output image
- * @mask: convolution mask
- * @n_layers: number of layers for approximation
- * @cluster: cluster lines closer than this distance
- *
- * Perform an approximate convolution of @in with @mask.
- *
- * The output image 
- * always has the same #VipsBandFormat as the input image. 
- *
- * Larger values for @n_layers give more accurate
- * results, but are slower. As @n_layers approaches the mask radius, the
- * accuracy will become close to exact convolution and the speed will drop to 
- * match. For many large masks, such as Gaussian, @n_layers need be only 10% of
- * this value and accuracy will still be good.
- *
- * Smaller values of @cluster will give more accurate results, but be slower
- * and use more memory. 10% of the mask radius is a good rule of thumb.
- *
- * See also: im_convsep_f(), im_create_dmaskv().
- *
- * Returns: 0 on success, -1 on error
- */
-int 
-im_aconv( IMAGE *in, IMAGE *out, DOUBLEMASK *mask, int n_layers, int cluster )
-{
-	IMAGE *t[2];
-	Boxes *boxes;
-
-	if( !(boxes = boxes_new( in, out, mask, n_layers, cluster )) ||
-		im_open_local_array( out, t, 2, "im_aconv", "p" ) )
-		return( -1 );
-
-	/*
-	 */
-	if( im_embed( in, t[0], 1, mask->xsize / 2, mask->ysize / 2, 
-		in->Xsize + mask->xsize - 1, in->Ysize + mask->ysize - 1 ) ||
-		aconv_horizontal( boxes, t[0], t[1] ) ||
-		aconv_vertical( boxes, t[1], out ) )
-		return( -1 );
-
-	/* For testing .. just try one direction.
-	if( aconv_horizontal( boxes, in, out ) )
-		return( -1 );
-	 */
-
-	out->Xoffset = 0;
-	out->Yoffset = 0;
-
-	return( 0 );
-}
-
diff --git a/libvips/convolution/im_aconvsep.c b/libvips/convolution/im_aconvsep.c
deleted file mode 100644
index 4bc0a810..00000000
--- a/libvips/convolution/im_aconvsep.c
+++ /dev/null
@@ -1,879 +0,0 @@
-/* im_aconvsep ... separable approximate convolution
- *
- * This operation does an approximate, seperable convolution. 
- *
- * Author: John Cupitt & Nicolas Robidoux
- * Written on: 31/5/11
- * Modified on: 
- * 31/5/11
- *      - from im_conv()
- */
-
-/*
-
-    This file is part of VIPS.
-    
-    VIPS is free software; you can redistribute it and/or modify
-    it under the terms of the GNU Lesser General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
-    02110-1301  USA
-
- */
-
-/*
-
-    These files are distributed with VIPS - http://www.vips.ecs.soton.ac.uk
-
- */
-
-/*
-
-  See:
-
-	http://incubator.quasimondo.com/processing/stackblur.pde
-
-  This thing is a little like stackblur, but generalised to any separable 
-  mask.
-
- */
-
-/*
-
-  TODO
-
-  	- are we handling mask offset correctly?
-
- */
-
-/* Show sample pixels as they are transformed.
-#define DEBUG_PIXELS
- */
-
-/*
-#define DEBUG
-#define VIPS_DEBUG
- */
-
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif /*HAVE_CONFIG_H*/
-#include <vips/intl.h>
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <limits.h>
-#include <math.h>
-
-#include <vips/vips.h>
-#include <vips/vector.h>
-#include <vips/debug.h>
-
-/* Maximum number of lines we can break the mask into.
- */
-#define MAX_LINES (1000)
-
-/* Euclid's algorithm. Use this to common up mults.
- */
-static int
-gcd( int a, int b )
-{
-	if( b == 0 )
-		return( abs( a ) );
-	else
-		return( gcd( b, a % b ) );
-}
-
-/* A set of lines.
- */
-typedef struct _Lines {
-	/* Copy of our arguments.
-	 */
-	IMAGE *in;
-	IMAGE *out;
-	DOUBLEMASK *mask;
-	int n_layers;
-
-	int area;
-	int rounding;
-
-	/* Start is the left-most pixel in the line, end is one beyond the
-	 * right-most pixel.
-	 */
-	int n_lines;
-	int start[MAX_LINES];
-	int end[MAX_LINES];
-	int factor[MAX_LINES];
-} Lines;
-
-static void
-lines_start( Lines *lines, int x, int factor )
-{
-	lines->start[lines->n_lines] = x;
-	lines->factor[lines->n_lines] = factor;
-}
-
-static int
-lines_end( Lines *lines, int x )
-{
-	lines->end[lines->n_lines] = x;
-
-	if( lines->n_lines >= MAX_LINES - 1 ) {
-		vips_error( "im_aconvsep", "%s", _( "mask too complex" ) );
-		return( -1 );
-	}
-	lines->n_lines += 1;
-
-	return( 0 );
-}
-
-/* Break a mask into lines.
- */
-static Lines *
-lines_new( IMAGE *in, IMAGE *out, DOUBLEMASK *mask, int n_layers )
-{
-	const int width = mask->xsize * mask->ysize;
-
-	Lines *lines;
-	double max;
-	double min;
-	double depth;
-	double sum;
-	int layers_above;
-	int layers_below;
-	int z, n, x;
-
-	/* Check parameters.
-	 */
-	if( im_piocheck( in, out ) ||
-		im_check_uncoded( "im_aconvsep", in ) ||
-		vips_check_dmask_1d( "im_aconvsep", mask ) ) 
-		return( NULL );
-
-	lines = VIPS_NEW( out, Lines );
-	lines->in = in;
-	lines->out = out;
-	if( !(lines->mask = (DOUBLEMASK *) im_local( out, 
-		(im_construct_fn) im_dup_dmask,
-		(im_callback_fn) im_free_dmask, mask, mask->filename, NULL )) )
-		return( NULL );
-	lines->n_layers = n_layers;
-	lines->n_lines = 0;
-
-	VIPS_DEBUG_MSG( "lines_new: breaking into %d layers ...\n", n_layers );
-
-	/* Find mask range. We must always include the zero axis in the mask.
-	 */
-	max = 0;
-	min = 0;
-	for( x = 0; x < width; x++ ) {
-		if( mask->coeff[x] > max )
-			max = mask->coeff[x];
-		if( mask->coeff[x] < min )
-			min = mask->coeff[x];
-	}
-
-	/* The zero axis must fall on a layer boundary. Estimate the
-	 * depth, find n-lines-above-zero, get exact depth, then calculate a
-	 * fixed n-lines which includes any negative parts.
-	 */
-	depth = (max - min) / n_layers;
-	layers_above = ceil( max / depth );
-	depth = max / layers_above;
-	layers_below = floor( min / depth );
-	n_layers = layers_above - layers_below;
-
-	VIPS_DEBUG_MSG( "depth = %g, n_layers = %d\n", depth, n_layers );
-
-	/* For each layer, generate a set of lines which are inside the
-	 * perimeter. Work down from the top.
-	 */
-	for( z = 0; z < n_layers; z++ ) {
-		double y = max - (1 + z) * depth;
-
-		/* y plus half depth ... ie. the layer midpoint.
-		 */
-		double y_ph = y + depth / 2;
-
-		/* Odd, but we must avoid rounding errors that make us miss 0
-		 * in the line above.
-		 */
-		int y_positive = z < layers_above;
-
-		int inside;
-
-		/* Start outside the perimeter.
-		 */
-		inside = 0;
-
-		for( x = 0; x < width; x++ ) {
-			/* The vertical line from mask[z] to 0 is inside. Is
-			 * our current square (x, y) part of that line?
-			 */
-			if( (y_positive && mask->coeff[x] >= y_ph) ||
-				(!y_positive && mask->coeff[x] <= y_ph) ) {
-				if( !inside ) {
-					lines_start( lines, x, 
-						y_positive ? 1 : -1 );
-					inside = 1;
-				}
-			}
-			else {
-				if( inside ) {
-					if( lines_end( lines, x ) )
-						return( NULL );
-					inside = 0;
-				}
-			}
-		}
-
-		if( inside && 
-			lines_end( lines, width ) )
-			return( NULL );
-	}
-
-	/* Can we common up any lines? Search for lines with identical
-	 * start/end.
-	 */
-	for( z = 0; z < lines->n_lines; z++ ) {
-		for( n = z + 1; n < lines->n_lines; n++ ) {
-			if( lines->start[z] == lines->start[n] &&
-				lines->end[z] == lines->end[n] ) {
-				lines->factor[z] += lines->factor[n];
-
-				/* n can be deleted. Do this in a separate
-				 * pass below.
-				 */
-				lines->factor[n] = 0;
-			}
-		}
-	}
-
-	/* Now we can remove all factor 0 lines.
-	 */
-	for( z = 0; z < lines->n_lines; z++ ) {
-		if( lines->factor[z] == 0 ) {
-			for( x = z; x < lines->n_lines; x++ ) {
-				lines->start[x] = lines->start[x + 1];
-				lines->end[x] = lines->end[x + 1];
-				lines->factor[x] = lines->factor[x + 1];
-			}
-			lines->n_lines -= 1;
-		}
-	}
-
-	/* Find the area of the lines.
-	 */
-	lines->area = 0;
-	for( z = 0; z < lines->n_lines; z++ ) 
-		lines->area += lines->factor[z] * 
-			(lines->end[z] - lines->start[z]);
-
-	/* Strength reduction: if all lines are divisible by n, we can move
-	 * that n out into the ->area factor. The aim is to produce as many
-	 * factor 1 lines as we can and to reduce the chance of overflow.
-	 */
-	x = lines->factor[0];
-	for( z = 1; z < lines->n_lines; z++ ) 
-		x = gcd( x, lines->factor[z] );
-	for( z = 0; z < lines->n_lines; z++ ) 
-		lines->factor[z] /= x;
-	lines->area *= x;
-
-	/* Find the area of the original mask.
-	 */
-	sum = 0;
-	for( z = 0; z < width; z++ ) 
-		sum += mask->coeff[z];
-
-	lines->area = rint( sum * lines->area / mask->scale );
-	lines->rounding = (lines->area + 1) / 2 + mask->offset * lines->area;
-
-	/* ASCII-art layer drawing.
-	printf( "lines:\n" );
-	for( z = 0; z < lines->n_lines; z++ ) {
-		printf( "%3d - %2d x ", z, lines->factor[z] );
-		for( x = 0; x < 55; x++ ) {
-			int rx = x * (width + 1) / 55;
-
-			if( rx >= lines->start[z] && rx < lines->end[z] )
-				printf( "#" );
-			else
-				printf( " " );
-		}
-		printf( " %3d .. %3d\n", lines->start[z], lines->end[z] );
-	}
-	printf( "area = %d\n", lines->area );
-	printf( "rounding = %d\n", lines->rounding );
-	 */
-
-	return( lines );
-}
-
-/* Our sequence value.
- */
-typedef struct {
-	Lines *lines;
-	REGION *ir;		/* Input region */
-
-	int *start;		/* Offsets for start and stop */
-	int *end;
-
-	/* The sums for each line. int for integer types, double for floating
-	 * point types.
-	 */
-	void *sum;		
-
-	int last_stride;	/* Avoid recalcing offsets, if we can */
-} AConvSep;
-
-/* Free a sequence value.
- */
-static int
-aconvsep_stop( void *vseq, void *a, void *b )
-{
-	AConvSep *seq = (AConvSep *) vseq;
-
-	IM_FREEF( im_region_free, seq->ir );
-
-	return( 0 );
-}
-
-/* Convolution start function.
- */
-static void *
-aconvsep_start( IMAGE *out, void *a, void *b )
-{
-	IMAGE *in = (IMAGE *) a;
-	Lines *lines = (Lines *) b;
-
-	AConvSep *seq;
-
-	if( !(seq = IM_NEW( out, AConvSep )) )
-		return( NULL );
-
-	/* Init!
-	 */
-	seq->lines = lines;
-	seq->ir = im_region_create( in );
-	seq->start = IM_ARRAY( out, lines->n_lines, int );
-	seq->end = IM_ARRAY( out, lines->n_lines, int );
-	if( vips_band_format_isint( out->BandFmt ) )
-		seq->sum = IM_ARRAY( out, lines->n_lines, int );
-	else
-		seq->sum = IM_ARRAY( out, lines->n_lines, double );
-	seq->last_stride = -1;
-
-	if( !seq->ir || !seq->start || !seq->end || !seq->sum ) {
-		aconvsep_stop( seq, in, lines );
-		return( NULL );
-	}
-
-	return( seq );
-}
-
-#define CLIP_UCHAR( V ) \
-G_STMT_START { \
-	if( (V) < 0 ) \
-		(V) = 0; \
-	else if( (V) > UCHAR_MAX ) \
-		(V) = UCHAR_MAX; \
-} G_STMT_END
-
-#define CLIP_CHAR( V ) \
-G_STMT_START { \
-	if( (V) < SCHAR_MIN ) \
-		(V) = SCHAR_MIN; \
-	else if( (V) > SCHAR_MAX ) \
-		(V) = SCHAR_MAX; \
-} G_STMT_END
-
-#define CLIP_USHORT( V ) \
-G_STMT_START { \
-	if( (V) < 0 ) \
-		(V) = 0; \
-	else if( (V) > USHRT_MAX ) \
-		(V) = USHRT_MAX; \
-} G_STMT_END
-
-#define CLIP_SHORT( V ) \
-G_STMT_START { \
-	if( (V) < SHRT_MIN ) \
-		(V) = SHRT_MIN; \
-	else if( (V) > SHRT_MAX ) \
-		(V) = SHRT_MAX; \
-} G_STMT_END
-
-#define CLIP_NONE( V ) {}
-
-/* The h and v loops are very similar, but also annoyingly different. Keep
- * them separate for easy debugging.
- */
-
-#define HCONV_INT( TYPE, CLIP ) { \
-	for( i = 0; i < bands; i++ ) { \
-		int *seq_sum = (int *) seq->sum; \
-		\
-		TYPE *q; \
-		TYPE *p; \
-		int sum; \
-		\
-		p = i + (TYPE *) IM_REGION_ADDR( ir, r->left, r->top + y ); \
-		q = i + (TYPE *) IM_REGION_ADDR( or, r->left, r->top + y ); \
-		\
-		sum = 0; \
-		for( z = 0; z < n_lines; z++ ) { \
-			seq_sum[z] = 0; \
-			for( x = lines->start[z]; x < lines->end[z]; x++ ) \
-				seq_sum[z] += p[x * istride]; \
-			sum += lines->factor[z] * seq_sum[z]; \
-		} \
-		sum = (sum + lines->rounding) / lines->area; \
-		CLIP( sum ); \
-		*q = sum; \
-		q += ostride; \
-		\
-		for( x = 1; x < r->width; x++ ) {  \
-			sum = 0; \
-			for( z = 0; z < n_lines; z++ ) { \
-				seq_sum[z] += p[seq->end[z]]; \
-				seq_sum[z] -= p[seq->start[z]]; \
-				sum += lines->factor[z] * seq_sum[z]; \
-			} \
-			p += istride; \
-			sum = (sum + lines->rounding) / lines->area; \
-			CLIP( sum ); \
-			*q = sum; \
-			q += ostride; \
-		} \
-	} \
-}
-
-#define HCONV_FLOAT( TYPE ) { \
-	for( i = 0; i < bands; i++ ) { \
-		double *seq_sum = (double *) seq->sum; \
-		\
-		TYPE *q; \
-		TYPE *p; \
-		double sum; \
-		\
-		p = i + (TYPE *) IM_REGION_ADDR( ir, r->left, r->top + y ); \
-		q = i + (TYPE *) IM_REGION_ADDR( or, r->left, r->top + y ); \
-		\
-		sum = 0; \
-		for( z = 0; z < lines->n_lines; z++ ) { \
-			seq_sum[z] = 0; \
-			for( x = lines->start[z]; x < lines->end[z]; x++ ) \
-				seq_sum[z] += p[x * istride]; \
-			sum += lines->factor[z] * seq_sum[z]; \
-		} \
-		sum = sum / lines->area + mask->offset; \
-		*q = sum; \
-		q += ostride; \
-		\
-		for( x = 1; x < r->width; x++ ) {  \
-			sum = 0; \
-			for( z = 0; z < lines->n_lines; z++ ) { \
-				seq_sum[z] += p[seq->end[z]]; \
-				seq_sum[z] -= p[seq->start[z]]; \
-				sum += lines->factor[z] * seq_sum[z]; \
-			} \
-			p += istride; \
-			sum = sum / lines->area + mask->offset; \
-			*q = sum; \
-			q += ostride; \
-		} \
-	} \
-}
-
-/* Do horizontal masks ... we scan the mask along scanlines.
- */
-static int
-aconvsep_generate_horizontal( REGION *or, void *vseq, void *a, void *b )
-{
-	AConvSep *seq = (AConvSep *) vseq;
-	IMAGE *in = (IMAGE *) a;
-	Lines *lines = (Lines *) b;
-
-	REGION *ir = seq->ir;
-	const int n_lines = lines->n_lines;
-	DOUBLEMASK *mask = lines->mask;
-	Rect *r = &or->valid;
-
-	/* Double the bands (notionally) for complex.
-	 */
-	int bands = vips_band_format_iscomplex( in->BandFmt ) ? 
-		2 * in->Bands : in->Bands;
-
-	Rect s;
-	int x, y, z, i;
-	int istride;
-	int ostride;
-
-	/* Prepare the section of the input image we need. A little larger
-	 * than the section of the output image we are producing.
-	 */
-	s = *r;
-	s.width += mask->xsize - 1;
-	s.height += mask->ysize - 1;
-	if( im_prepare( ir, &s ) )
-		return( -1 );
-
-	/* Stride can be different for the vertical case, keep this here for
-	 * ease of direction change.
-	 */
-	istride = IM_IMAGE_SIZEOF_PEL( in ) / 
-		IM_IMAGE_SIZEOF_ELEMENT( in );
-	ostride = IM_IMAGE_SIZEOF_PEL( lines->out ) / 
-		IM_IMAGE_SIZEOF_ELEMENT( lines->out );
-
-        /* Init offset array. 
-	 */
-	if( seq->last_stride != istride ) {
-		seq->last_stride = istride;
-
-		for( z = 0; z < n_lines; z++ ) {
-			seq->start[z] = lines->start[z] * istride;
-			seq->end[z] = lines->end[z] * istride;
-		}
-	}
-
-	for( y = 0; y < r->height; y++ ) { 
-		switch( in->BandFmt ) {
-		case IM_BANDFMT_UCHAR: 	
-			HCONV_INT( unsigned char, CLIP_UCHAR );
-			break;
-
-		case IM_BANDFMT_CHAR: 	
-			HCONV_INT( signed char, CLIP_UCHAR );
-			break;
-
-		case IM_BANDFMT_USHORT: 	
-			HCONV_INT( unsigned short, CLIP_USHORT );
-			break;
-
-		case IM_BANDFMT_SHORT: 	
-			HCONV_INT( signed short, CLIP_SHORT );
-			break;
-
-		case IM_BANDFMT_UINT: 	
-			HCONV_INT( unsigned int, CLIP_NONE );
-			break;
-
-		case IM_BANDFMT_INT: 	
-			HCONV_INT( signed int, CLIP_NONE );
-			break;
-
-		case IM_BANDFMT_FLOAT: 	
-			HCONV_FLOAT( float );
-			break;
-
-		case IM_BANDFMT_DOUBLE: 	
-			HCONV_FLOAT( double );
-			break;
-
-		case IM_BANDFMT_COMPLEX: 	
-			HCONV_FLOAT( float );
-			break;
-
-		case IM_BANDFMT_DPCOMPLEX: 	
-			HCONV_FLOAT( double );
-			break;
-
-		default:
-			g_assert_not_reached();
-		}
-	}
-
-	return( 0 );
-}
-
-#define VCONV_INT( TYPE, CLIP ) { \
-	for( x = 0; x < sz; x++ ) { \
-		int *seq_sum = (int *) seq->sum; \
-		\
-		TYPE *q; \
-		TYPE *p; \
-		int sum; \
-		\
-		p = x + (TYPE *) IM_REGION_ADDR( ir, r->left, r->top ); \
-		q = x + (TYPE *) IM_REGION_ADDR( or, r->left, r->top ); \
-		\
-		sum = 0; \
-		for( z = 0; z < lines->n_lines; z++ ) { \
-			seq_sum[z] = 0; \
-			for( y = lines->start[z]; y < lines->end[z]; y++ ) \
-				seq_sum[z] += p[y * istride]; \
-			sum += lines->factor[z] * seq_sum[z]; \
-		} \
-		sum = (sum + lines->rounding) / lines->area; \
-		CLIP( sum ); \
-		*q = sum; \
-		q += ostride; \
-		\
-		for( y = 1; y < r->height; y++ ) { \
-			sum = 0; \
-			for( z = 0; z < lines->n_lines; z++ ) { \
-				seq_sum[z] += p[seq->end[z]]; \
-				seq_sum[z] -= p[seq->start[z]]; \
-				sum += lines->factor[z] * seq_sum[z]; \
-			} \
-			p += istride; \
-			sum = (sum + lines->rounding) / lines->area; \
-			CLIP( sum ); \
-			*q = sum; \
-			q += ostride; \
-		}   \
-	} \
-}
-
-#define VCONV_FLOAT( TYPE ) { \
-	for( x = 0; x < sz; x++ ) { \
-		double *seq_sum = (double *) seq->sum; \
-		\
-		TYPE *q; \
-		TYPE *p; \
-		double sum; \
-		\
-		p = x + (TYPE *) IM_REGION_ADDR( ir, r->left, r->top ); \
-		q = x + (TYPE *) IM_REGION_ADDR( or, r->left, r->top ); \
-		\
-		sum = 0; \
-		for( z = 0; z < lines->n_lines; z++ ) { \
-			seq_sum[z] = 0; \
-			for( y = lines->start[z]; y < lines->end[z]; y++ ) \
-				seq_sum[z] += p[y * istride]; \
-			sum += lines->factor[z] * seq_sum[z]; \
-		} \
-		sum = sum / lines->area + mask->offset; \
-		*q = sum; \
-		q += ostride; \
-		\
-		for( y = 1; y < r->height; y++ ) { \
-			sum = 0; \
-			for( z = 0; z < lines->n_lines; z++ ) { \
-				seq_sum[z] += p[seq->end[z]]; \
-				seq_sum[z] -= p[seq->start[z]]; \
-				sum += lines->factor[z] * seq_sum[z]; \
-			} \
-			p += istride; \
-			sum = sum / lines->area + mask->offset; \
-			*q = sum; \
-			q += ostride; \
-		}   \
-	} \
-}
-
-/* Do vertical masks ... we scan the mask down columns of pixels. Copy-paste
- * from above with small changes.
- */
-static int
-aconvsep_generate_vertical( REGION *or, void *vseq, void *a, void *b )
-{
-	AConvSep *seq = (AConvSep *) vseq;
-	IMAGE *in = (IMAGE *) a;
-	Lines *lines = (Lines *) b;
-
-	REGION *ir = seq->ir;
-	const int n_lines = lines->n_lines;
-	DOUBLEMASK *mask = lines->mask;
-	Rect *r = &or->valid;
-
-	/* Double the width (notionally) for complex.
-	 */
-	int sz = vips_band_format_iscomplex( in->BandFmt ) ? 
-		2 * IM_REGION_N_ELEMENTS( or ) : IM_REGION_N_ELEMENTS( or );
-
-	Rect s;
-	int x, y, z;
-	int istride;
-	int ostride;
-
-	/* Prepare the section of the input image we need. A little larger
-	 * than the section of the output image we are producing.
-	 */
-	s = *r;
-	s.width += mask->xsize - 1;
-	s.height += mask->ysize - 1;
-	if( im_prepare( ir, &s ) )
-		return( -1 );
-
-	/* Stride can be different for the vertical case, keep this here for
-	 * ease of direction change.
-	 */
-	istride = IM_REGION_LSKIP( ir ) / 
-		IM_IMAGE_SIZEOF_ELEMENT( lines->in );
-	ostride = IM_REGION_LSKIP( or ) / 
-		IM_IMAGE_SIZEOF_ELEMENT( lines->out );
-
-        /* Init offset array. 
-	 */
-	if( seq->last_stride != istride ) {
-		seq->last_stride = istride;
-
-		for( z = 0; z < n_lines; z++ ) {
-			seq->start[z] = lines->start[z] * istride;
-			seq->end[z] = lines->end[z] * istride;
-		}
-	}
-
-	switch( in->BandFmt ) {
-	case IM_BANDFMT_UCHAR: 	
-		VCONV_INT( unsigned char, CLIP_UCHAR );
-		break;
-
-	case IM_BANDFMT_CHAR: 	
-		VCONV_INT( signed char, CLIP_UCHAR );
-		break;
-
-	case IM_BANDFMT_USHORT: 	
-		VCONV_INT( unsigned short, CLIP_USHORT );
-		break;
-
-	case IM_BANDFMT_SHORT: 	
-		VCONV_INT( signed short, CLIP_SHORT );
-		break;
-
-	case IM_BANDFMT_UINT: 	
-		VCONV_INT( unsigned int, CLIP_NONE );
-		break;
-
-	case IM_BANDFMT_INT: 	
-		VCONV_INT( signed int, CLIP_NONE );
-		break;
-
-	case IM_BANDFMT_FLOAT: 	
-		VCONV_FLOAT( float );
-		break;
-
-	case IM_BANDFMT_DOUBLE: 	
-		VCONV_FLOAT( double );
-		break;
-
-	case IM_BANDFMT_COMPLEX: 	
-		VCONV_FLOAT( float );
-		break;
-
-	case IM_BANDFMT_DPCOMPLEX: 	
-		VCONV_FLOAT( double );
-		break;
-
-	default:
-		g_assert_not_reached();
-	}
-
-	return( 0 );
-}
-
-static int
-aconvsep_raw( IMAGE *in, IMAGE *out, DOUBLEMASK *mask, int n_layers )
-{
-	Lines *lines;
-	im_generate_fn generate;
-
-#ifdef DEBUG
-	printf( "aconvsep_raw: starting with matrix:\n" );
-	im_print_dmask( mask );
-#endif /*DEBUG*/
-
-	if( !(lines = lines_new( in, out, mask, n_layers )) )
-		return( -1 );
-
-	/* Prepare output. Consider a 7x7 mask and a 7x7 image --- the output
-	 * would be 1x1.
-	 */
-	if( im_cp_desc( out, in ) )
-		return( -1 );
-	out->Xsize -= mask->xsize - 1;
-	out->Ysize -= mask->ysize - 1;
-	if( out->Xsize <= 0 || out->Ysize <= 0 ) {
-		im_error( "im_aconvsep", "%s", _( "image too small for mask" ) );
-		return( -1 );
-	}
-
-	if( mask->xsize == 1 )
-		generate = aconvsep_generate_vertical;
-	else 
-		generate = aconvsep_generate_horizontal;
-
-	if( im_demand_hint( out, IM_SMALLTILE, in, NULL ) ||
-		im_generate( out, 
-			aconvsep_start, generate, aconvsep_stop, in, lines ) )
-		return( -1 );
-
-	out->Xoffset = -mask->xsize / 2;
-	out->Yoffset = -mask->ysize / 2;
-
-	return( 0 );
-}
-
-/**
- * im_aconvsep:
- * @in: input image
- * @out: output image
- * @mask: convolution mask
- * @n_layers: number of layers for approximation
- *
- * Perform an approximate separable convolution of @in with @mask.
- *
- * The mask must be 1xn or nx1 elements. 
- * The output image 
- * always has the same #VipsBandFormat as the input image. 
- *
- * The image is convolved twice: once with @mask and then again with @mask 
- * rotated by 90 degrees. 
- *
- * Larger values for @n_layers give more accurate
- * results, but are slower. As @n_layers approaches the mask radius, the
- * accuracy will become close to exact convolution and the speed will drop to 
- * match. For many large masks, such as Gaussian, @n_layers need be only 10% of
- * this value and accuracy will still be good.
- *
- * See also: im_convsep_f(), im_create_dmaskv().
- *
- * Returns: 0 on success, -1 on error
- */
-int 
-im_aconvsep( IMAGE *in, IMAGE *out, DOUBLEMASK *mask, int n_layers )
-{
-	IMAGE *t[2];
-	const int n_mask = mask->xsize * mask->ysize;
-	DOUBLEMASK *rmask;
-
-	if( im_open_local_array( out, t, 2, "im_aconvsep", "p" ) ||
-		!(rmask = (DOUBLEMASK *) im_local( out, 
-		(im_construct_fn) im_dup_dmask,
-		(im_callback_fn) im_free_dmask, mask, mask->filename, NULL )) )
-		return( -1 );
-
-	rmask->xsize = mask->ysize;
-	rmask->ysize = mask->xsize;
-
-	/*
-	 */
-	if( im_embed( in, t[0], 1, n_mask / 2, n_mask / 2, 
-		in->Xsize + n_mask - 1, in->Ysize + n_mask - 1 ) ||
-		aconvsep_raw( t[0], t[1], mask, n_layers ) ||
-		aconvsep_raw( t[1], out, rmask, n_layers ) )
-		return( -1 );
-
-	/* For testing .. just try one direction.
-	if( aconvsep_raw( in, out, mask, n_layers ) )
-		return( -1 );
-	 */
-
-	out->Xoffset = 0;
-	out->Yoffset = 0;
-
-	return( 0 );
-}
-
diff --git a/libvips/convolution/im_conv.c b/libvips/convolution/im_conv.c
deleted file mode 100644
index 1dc477de..00000000
--- a/libvips/convolution/im_conv.c
+++ /dev/null
@@ -1,1106 +0,0 @@
-/* im_conv
- *
- * Copyright: 1990, N. Dessipris.
- *
- * Author: Nicos Dessipris & Kirk Martinez
- * Written on: 29/04/1991
- * Modified on: 19/05/1991
- * 8/7/93 JC
- *      - adapted for partial v2
- *      - memory leaks fixed
- *      - ANSIfied
- * 23/7/93 JC
- *	- inner loop unrolled with a switch - 25% speed-up!
- * 13/12/93 JC
- *	- tiny rounding error removed
- * 7/10/94 JC
- *	- new IM_ARRAY() macro
- *	- various simplifications
- *	- evalend callback added
- * 1/2/95 JC
- *	- use of IM_REGION_ADDR() updated
- *	- output size was incorrect! see comment below
- *	- bug with large non-square matricies fixed too
- *	- uses new im_embed() function
- * 13/7/98 JC
- *	- wierd bug ... im_free_imask is no longer directly called for close
- *	  callback, caused SIGKILL on solaris 2.6 ... linker bug?
- * 9/3/01 JC
- *	- reworked and simplified, about 10% faster
- *	- slightly better range clipping
- * 27/7/01 JC
- *	- reject masks with scale == 0
- * 7/4/04 
- *	- im_conv() now uses im_embed() with edge stretching on the input, not
- *	  the output
- *	- sets Xoffset / Yoffset
- * 11/11/05
- * 	- simpler inner loop avoids gcc4 bug 
- * 7/11/07
- * 	- new evalstart/end callbacks
- * 12/5/08
- * 	- int rounding was +1 too much, argh
- * 	- only rebuild the buffer offsets if bpl changes
- * 5/4/09
- * 	- tiny speedups and cleanups
- * 	- add restrict, though it doesn't seem to help gcc
- * 12/11/09
- * 	- only check for non-zero elements once
- * 	- add mask-all-zero check
- * 	- cleanups
- * 3/2/10
- * 	- gtkdoc
- * 	- more cleanups
- * 23/08/10
- * 	- add a special case for 3x3 masks, about 20% faster
- * 1/10/10
- * 	- support complex (just double the bands)
- * 18/10/10
- * 	- add experimental Orc path
- * 29/10/10
- * 	- use VipsVector
- * 	- get rid of im_convsep(), just call this twice, no longer worth
- * 	  keeping two versions
- * 8/11/10
- * 	- add array tiling
- * 9/5/11
- * 	- argh typo in overflow estimation could cause errors
- * 15/10/11 Nicolas
- * 	- handle offset correctly in seperable convolutions
- * 26/1/16 Lovell Fuller
- * 	- remove Duff for a 25% speedup
- */
-
-/*
-
-    This file is part of VIPS.
-    
-    VIPS is free software; you can redistribute it and/or modify
-    it under the terms of the GNU Lesser General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
-    02110-1301  USA
-
- */
-
-/*
-
-    These files are distributed with VIPS - http://www.vips.ecs.soton.ac.uk
-
- */
-
-/* Show sample pixels as they are transformed.
-#define DEBUG_PIXELS
- */
-
-/*
-#define DEBUG
- */
-
-/* 
-
- 	TODO
-
-	- tried 8-bit data with a 32-bit intermediate, but it was only 
-	  slightly faster than C
-
-	  16-bit data would be even slower, no speed advantage
-
-	- make up a signed 8-bit code path?
-
-	- don't use divluw, it's insanely slow, instead scale coefficients so 
-	  that we can just do >>8 at the end
-
- */
-
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif /*HAVE_CONFIG_H*/
-#include <vips/intl.h>
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <limits.h>
-
-#include <vips/vips.h>
-#include <vips/vector.h>
-
-/* We can't run more than this many passes. Larger than this and we
- * fall back to C.
- */
-#define MAX_PASS (10)
-
-/* A pass with a vector. 
- */
-typedef struct {
-	int first;		/* The index of the first mask coff we use */
-	int last;		/* The index of the last mask coff we use */
-
-	int r;			/* Set previous result in this var */
-
-        /* The code we generate for this section of this mask. 
-	 */
-        VipsVector *vector;
-} Pass;
-
-/* Our parameters ... we take a copy of the mask argument, plus we make a
- * smaller version with the zeros squeezed out. 
- */
-typedef struct {
-	IMAGE *in;
-	IMAGE *out;
-	INTMASK *mask;		/* Copy of mask arg */
-
-	int nnz;		/* Number of non-zero mask elements */
-	int *coeff;		/* Array of non-zero mask coefficients */
-	int *coeff_pos;		/* Index of each nnz element in mask->coeff */
-
-	int underflow;		/* Global underflow/overflow counts */
-	int overflow;
-
-	/* The convolver we generate for this mask. We have to split the
-	 * convolve and clip into two phases.
-	 */
-	int n_pass;	
-	Pass pass[MAX_PASS];
-	int s1;			/* Input to clip */
-	VipsVector *clip;
-} Conv;
-
-static void
-conv_vector_free( Conv *conv )
-{
-	int i;
-
-	for( i = 0; i < conv->n_pass; i++ )
-		IM_FREEF( vips_vector_free, conv->pass[i].vector );
-	conv->n_pass = 0;
-
-	IM_FREEF( vips_vector_free, conv->clip );
-}
-
-static int
-conv_close( Conv *conv )
-{
-	IM_FREEF( im_free_imask, conv->mask );
-	conv_vector_free( conv );
-
-        return( 0 );
-}
-
-static int
-conv_evalstart( Conv *conv )
-{
-	/* Reset underflow/overflow count.
-	 *
-	 * This often doesn't get called until eval has already finished, so
-	 * resetting here just wipes all records. 
-	 *
-	conv->overflow = 0;
-	conv->underflow = 0;
-	 *
-	 */
-
-        return( 0 );
-}
-
-static int
-conv_evalend( Conv *conv )
-{
-	if( conv->overflow ) 
-		vips_info( "im_conv", 
-			_( "%d overflows detected" ), conv->overflow ); 
-	if( conv->underflow )
-		vips_info( "im_conv", 
-			_( "%d underflows detected" ), conv->underflow );
-
-        return( 0 );
-}
-
-#define TEMP( N, S ) vips_vector_temporary( v, N, S )
-#define SCANLINE( N, P, S ) vips_vector_source_scanline( v, N, P, S )
-#define CONST( N, V, S ) vips_vector_constant( v, N, V, S )
-#define ASM2( OP, A, B ) vips_vector_asm2( v, OP, A, B )
-#define ASM3( OP, A, B, C ) vips_vector_asm3( v, OP, A, B, C )
-
-/* Generate code for a section of the mask.
- *
- * 0 for success, -1 on error.
- */
-static int
-conv_compile_convolution_u8s16_section( Pass *pass, 
-	Conv *conv, gboolean first_pass )
-{
-	INTMASK *mask = conv->mask;
-	const int n_mask = mask->xsize * mask->ysize; 
-
-	int i;
-	VipsVector *v;
-	char zero[256];
-	char offset[256];
-	char source[256];
-	char coeff[256];
-
-	pass->vector = v = vips_vector_new( "conv", 2 );
-
-	/* The value we fetch from the image, the product with the matrix
-	 * value, the accumulated sum.
-	 */
-	TEMP( "value", 1 );
-	TEMP( "product", 2 );
-	TEMP( "sum", 2 );
-
-	/* Init the sum. If this is the first pass, it's a constant. If this
-	 * is a later pass, we have to init the sum from the result 
-	 * of the previous pass. 
-	 */
-	if( first_pass ) {
-		CONST( zero, 0, 2 );
-		ASM2( "copyw", "sum", zero );
-	}
-	else {
-		/* "r" is the result of the previous pass. 
-		 */
-		pass->r = vips_vector_source_name( v, "r", 2 );
-		ASM2( "loadw", "sum", "r" );
-	}
-
-	for( i = pass->first; i < n_mask; i++ ) {
-		int x = i % mask->xsize;
-		int y = i / mask->xsize;
-
-		if( !mask->coeff[i] )
-			/* Exclude zero elements.
-			 */
-			continue;
-
-		/* The source. sl0 is the first scanline in the mask.
-		 */
-		SCANLINE( source, y, 1 );
-
-		/* The offset, only for non-first-columns though.
-		 */
-		if( x > 0 ) 
-			CONST( offset, conv->in->Bands * x, 1 );
-
-		/* The coefficient. Only for non-1 coeffs though, we skip the
-		 * mul for them.
-		 *
-		 * We need to do 8-bit unsigned pixel * signed mask, so we
-		 * have to cast the pixel up to 16-bit then do a mult against a
-		 * 16-bit constant. We know the result will fit in the bottom
-		 * 16 bits.
-		 */
-		if( mask->coeff[i] != 1 ) 
-			CONST( coeff, mask->coeff[i], 2 );
-
-		/* Two factors: 
-		 * - element is in the first column, ie. has a zero offset
-		 * - mask coeff is 1, ie. we can skip the multiply
-		 *
-		 * We could combine some of these cases, but it's simpler
-		 * and safer to spell them all out.
-		 */
-		if( x == 0 ) 
-			ASM2( "loadb", "value", source );
-		else 
-			ASM3( "loadoffb", "value", source, offset );
-
-		ASM2( "convubw", "product", "value" );
-
-		if( mask->coeff[i] != 1 ) 
-			ASM3( "mullw", "product", "product", coeff );
-
-		ASM3( "addssw", "sum", "sum", "product" );
-
-		if( vips_vector_full( v ) )
-			break;
-	}
-
-	pass->last = i;
-
-	ASM2( "copyw", "d1", "sum" );
-
-#ifdef DEBUG
-	vips_vector_print( v );
-	printf( "compiling ...\n" );
-#endif /*DEBUG*/
-
-	if( !vips_vector_compile( v ) ) 
-		return( -1 );
-
-	return( 0 );
-}
-
-/* Generate the convolution pass for u8 data with an s16 accumulator.
- *
- * 0 for success, -1 on error.
- */
-static int
-conv_compile_convolution_u8s16( Conv *conv )
-{
-	INTMASK *mask = conv->mask;
-	const int n_mask = mask->xsize * mask->ysize; 
-
-	double min, max;
-	int i;
-
-	if( conv->in->BandFmt != IM_BANDFMT_UCHAR )
-		return( -1 );
-
-	/* Can the accumulator overflow or underflow at any stage? Since
-	 * matrix elements are signed, we need to calculate a running 
-	 * possible min and max.
-	 */
-	min = 0;
-	max = 0;
-	for( i = 0; i < n_mask; i++ ) {
-		int v = 255 * mask->coeff[i];
-
-		min = IM_MIN( min, min + v );
-		max = IM_MAX( max, max + v );
-
-		if( max > SHRT_MAX )
-			return( -1 );
-		if( min < SHRT_MIN )
-			return( -1 );
-	}
-
-	/* Generate passes until we've used up the whole mask.
-	 */
-	for( i = 0;;) {
-		Pass *pass;
-
-		/* Skip any zero coefficients at the start of the mask 
-		 * region.
-		 */
-		for( ; i < n_mask && !mask->coeff[i]; i++ )
-			;
-		if( i == n_mask )
-			break;
-
-		/* Allocate space for another pass.
-		 */
-		if( conv->n_pass == MAX_PASS ) 
-			return( -1 );
-		pass = &conv->pass[conv->n_pass];
-		conv->n_pass += 1;
-
-		pass->first = i;
-		pass->last = i;
-		pass->r = -1;
-
-		if( conv_compile_convolution_u8s16_section( pass, 
-			conv, conv->n_pass == 1 ) )
-			return( -1 );
-		i = pass->last + 1;
-
-#ifdef DEBUG
-		printf( "conv_compile_convolution_u8s16: "
-			"first = %d, last = %d\n", 
-			pass->first, pass->last ); 
-#endif /*DEBUG*/
-
-		if( i >= n_mask )
-			break;
-	}
-
-	return( 0 );
-}
-
-/* Generate the program that does (pass + rounding) / scale + offset 
- * from a s16 intermediate back to a u8 output.
- */
-static int
-conv_compile_scale_s16u8( Conv *conv )
-{
-	INTMASK *mask = conv->mask;
-
-	VipsVector *v;
-	char scale[256];
-	char offset[256];
-	char zero[256];
-
-	/* Scale and offset must be in range.
-	 */
-	if( mask->scale > 255 ||
-		mask->scale < 0 ||
-		mask->offset > SHRT_MAX ||
-		mask->offset < SHRT_MIN ) 
-		return( -1 );
-
-	conv->clip = v = vips_vector_new( "clip", 1 );
-	conv->s1 = vips_vector_source_name( v, "s1", 2 );
-
-	TEMP( "t1", 2 );
-	TEMP( "t2", 2 );
-
-	/* We can only do unsigned divide, so we must add the offset before
-	 * dividing by the scale. We need to scale the offset up.
-	 *
-	 * We can build the rounding into the offset as well.
-	 * You might think this should be (scale + 1) / 2, but then we'd be 
-	 * adding one for scale == 1.
-	 */
-	CONST( scale, mask->scale, 1 );
-	CONST( offset, mask->offset * mask->scale + mask->scale / 2, 2 );
-	CONST( zero, 0, 2 );
-
-	/* Offset and scale. 
-	 */
-	ASM3( "addssw", "t1", "s1", offset );
-
-	/* We need to convert the signed result of the
-	 * offset to unsigned for the div, ie. we want to set anything <0 to 0.
-	 */
-	ASM3( "cmpgtsw", "t2", "t1", zero );
-	ASM3( "andw", "t1", "t1", "t2" );
-
-	ASM3( "divluw", "t1", "t1", scale );
-	ASM2( "convuuswb", "d1", "t1" );
-
-	if( !vips_vector_compile( v ) ) 
-		return( -1 );
-
-#ifdef DEBUG
-	vips_vector_print( v );
-#endif /*DEBUG*/
-
-	return( 0 );
-}
-
-static Conv *
-conv_new( IMAGE *in, IMAGE *out, INTMASK *mask )
-{
-        Conv *conv = IM_NEW( out, Conv );
-	const int n_mask = mask->xsize * mask->ysize;
-        int i;
-
-        if( !conv )
-                return( NULL );
-
-        conv->in = in;
-        conv->out = out;
-        conv->mask = NULL;
-        conv->nnz = 0;
-        conv->coeff = NULL;
-        conv->coeff_pos = NULL;
-        conv->underflow = 0;
-        conv->overflow = 0;
-
-	conv->n_pass = 0;
-	conv->s1 = -1;
-	conv->clip = NULL;
-
-        if( im_add_close_callback( out, 
-		(im_callback_fn) conv_close, conv, NULL ) ||
-		im_add_close_callback( out, 
-			(im_callback_fn) conv_evalstart, conv, NULL ) ||
-		im_add_close_callback( out, 
-			(im_callback_fn) conv_evalend, conv, NULL ) ||
-        	!(conv->coeff = IM_ARRAY( out, n_mask, int )) ||
-        	!(conv->coeff_pos = IM_ARRAY( out, n_mask, int )) ||
-        	!(conv->mask = im_dup_imask( mask, "conv_mask" )) )
-                return( NULL );
-
-        /* Find non-zero mask elements.
-         */
-        for( i = 0; i < n_mask; i++ )
-                if( mask->coeff[i] ) {
-			conv->coeff[conv->nnz] = mask->coeff[i];
-			conv->coeff_pos[conv->nnz] = i;
-			conv->nnz += 1;
-		}
-
-	/* Was the whole mask zero? We must have at least 1 element in there:
-	 * set it to zero.
-	 */
-	if( conv->nnz == 0 ) {
-		conv->coeff[0] = mask->coeff[0];
-		conv->coeff_pos[0] = 0;
-		conv->nnz = 1;
-	}
-
-	/* Generate code for this mask / image, if possible.
-	 */
-	if( vips_vector_isenabled() ) {
-		if( conv_compile_convolution_u8s16( conv ) ||
-			conv_compile_scale_s16u8( conv ) ) 
-			conv_vector_free( conv );
-	}
-
-        return( conv );
-}
-
-/* Our sequence value.
- */
-typedef struct {
-	Conv *conv;
-	REGION *ir;		/* Input region */
-
-	int *offsets;		/* Offsets for each non-zero matrix element */
-	VipsPel **pts;		/* Per-non-zero mask element pointers */
-
-	int underflow;		/* Underflow/overflow counts */
-	int overflow;
-
-	int last_bpl;		/* Avoid recalcing offsets, if we can */
-
-	/* We need a pair of intermediate buffers to keep the results of each
-	 * conv pass in. 
-	 */
-	void *t1;
-	void *t2;
-} ConvSequence;
-
-/* Free a sequence value.
- */
-static int
-conv_stop( void *vseq, void *a, void *b )
-{
-	ConvSequence *seq = (ConvSequence *) vseq;
-	Conv *conv = (Conv *) b;
-
-	/* Add local under/over counts to global counts.
-	 */
-	conv->overflow += seq->overflow;
-	conv->underflow += seq->underflow;
-
-	IM_FREEF( im_region_free, seq->ir );
-	IM_FREE( seq->t1 );
-	IM_FREE( seq->t2 );
-
-	return( 0 );
-}
-
-/* Convolution start function.
- */
-static void *
-conv_start( IMAGE *out, void *a, void *b )
-{
-	IMAGE *in = (IMAGE *) a;
-	Conv *conv = (Conv *) b;
-
-	ConvSequence *seq;
-
-	if( !(seq = IM_NEW( out, ConvSequence )) )
-		return( NULL );
-
-	/* Init!
-	 */
-	seq->conv = conv;
-	seq->ir = NULL;
-	seq->pts = NULL;
-	seq->underflow = 0;
-	seq->overflow = 0;
-	seq->last_bpl = -1;
-	seq->t1 = NULL;
-	seq->t2 = NULL;
-
-	/* Attach region and arrays.
-	 */
-	seq->ir = im_region_create( in );
-	seq->offsets = IM_ARRAY( out, conv->nnz, int );
-	seq->pts = IM_ARRAY( out, conv->nnz, VipsPel * );
-	if( !seq->ir || !seq->offsets || !seq->pts ) {
-		conv_stop( seq, in, conv );
-		return( NULL );
-	}
-
-	if( vips_vector_isenabled() && 
-		conv->n_pass ) {
-		seq->t1 = IM_ARRAY( NULL, IM_IMAGE_N_ELEMENTS( in ), short );
-		seq->t2 = IM_ARRAY( NULL, IM_IMAGE_N_ELEMENTS( in ), short );
-
-		if( !seq->t1 || !seq->t2 ) {
-			conv_stop( seq, in, conv );
-			return( NULL );
-		}
-	}
-
-	return( seq );
-}
-
-/* INT inner loops.
- */
-#define CONV_INT( TYPE, IM_CLIP ) { \
-	TYPE ** restrict p = (TYPE **) seq->pts; \
-	TYPE * restrict q = (TYPE *) IM_REGION_ADDR( or, le, y ); \
-	\
-	for( x = 0; x < sz; x++ ) {  \
-		int sum; \
-		int i; \
-		\
-		sum = 0; \
-		for ( i = 0; i < nnz; i++ ) \
-			sum += t[i] * p[i][x]; \
-		\
-		sum = ((sum + rounding) / mask->scale) + mask->offset; \
-		\
-		IM_CLIP; \
-		\
-		q[x] = sum;  \
-	}  \
-} 
-
-/* FLOAT inner loops.
- */
-#define CONV_FLOAT( TYPE ) { \
-	TYPE ** restrict p = (TYPE **) seq->pts; \
-	TYPE * restrict q = (TYPE *) IM_REGION_ADDR( or, le, y ); \
-	\
-	for( x = 0; x < sz; x++ ) {  \
-		double sum; \
-		int i; \
-		\
-		sum = 0; \
-		for ( i = 0; i < nnz; i++ ) \
-			sum += t[i] * p[i][x]; \
- 		\
-		sum = (sum / mask->scale) + mask->offset; \
-		\
-		q[x] = sum;  \
-	}  \
-} 
-
-/* Convolve! See below for the special-case 3x3 path.
- */
-static int
-conv_gen( REGION *or, void *vseq, void *a, void *b )
-{
-	ConvSequence *seq = (ConvSequence *) vseq;
-	IMAGE *in = (IMAGE *) a;
-	Conv *conv = (Conv *) b;
-	REGION *ir = seq->ir;
-	INTMASK *mask = conv->mask;
-	int * restrict t = conv->coeff; 
-	const int nnz = conv->nnz;
-
-	/* You might think this should be (scale + 1) / 2, but then we'd be 
-	 * adding one for scale == 1.
-	 */
-	int rounding = mask->scale / 2;
-
-	Rect *r = &or->valid;
-	Rect s;
-	int le = r->left;
-	int to = r->top;
-	int bo = IM_RECT_BOTTOM( r );
-	int sz = IM_REGION_N_ELEMENTS( or ) * (im_iscomplex( in ) ? 2 : 1);
-
-	int x, y, z, i;
-
-	/* Prepare the section of the input image we need. A little larger
-	 * than the section of the output image we are producing.
-	 */
-	s = *r;
-	s.width += mask->xsize - 1;
-	s.height += mask->ysize - 1;
-	if( im_prepare( ir, &s ) )
-		return( -1 );
-
-	VIPS_GATE_START( "conv_gen: work" ); 
-
-        /* Fill offset array. Only do this if the bpl has changed since the 
-	 * previous im_prepare().
-	 */
-	if( seq->last_bpl != IM_REGION_LSKIP( ir ) ) {
-		seq->last_bpl = IM_REGION_LSKIP( ir );
-
-		for( i = 0; i < nnz; i++ ) {
-			z = conv->coeff_pos[i];
-			x = z % conv->mask->xsize;
-			y = z / conv->mask->xsize;
-
-			seq->offsets[i] = 
-				IM_REGION_ADDR( ir, x + le, y + to ) -
-				IM_REGION_ADDR( ir, le, to );
-		}
-	}
-
-	for( y = to; y < bo; y++ ) { 
-		/* Init pts for this line of PELs.
-		 */
-                for( z = 0; z < nnz; z++ )
-                        seq->pts[z] = seq->offsets[z] +  
-                                IM_REGION_ADDR( ir, le, y ); 
-
-		switch( in->BandFmt ) {
-		case IM_BANDFMT_UCHAR: 	
-			CONV_INT( unsigned char, IM_CLIP_UCHAR( sum, seq ) ); 
-			break;
-
-		case IM_BANDFMT_CHAR:   
-			CONV_INT( signed char, IM_CLIP_CHAR( sum, seq ) ); 
-			break;
-
-		case IM_BANDFMT_USHORT: 
-			CONV_INT( unsigned short, IM_CLIP_USHORT( sum, seq ) ); 
-			break;
-
-		case IM_BANDFMT_SHORT:  
-			CONV_INT( signed short, IM_CLIP_SHORT( sum, seq ) ); 
-			break;
-
-		case IM_BANDFMT_UINT:   
-			CONV_INT( unsigned int, IM_CLIP_NONE( sum, seq ) ); 
-			break;
-
-		case IM_BANDFMT_INT:    
-			CONV_INT( signed int, IM_CLIP_NONE( sum, seq ) ); 
-			break;
-
-		case IM_BANDFMT_FLOAT:  
-		case IM_BANDFMT_COMPLEX:  
-			CONV_FLOAT( float ); 
-			break;
-
-		case IM_BANDFMT_DOUBLE: 
-		case IM_BANDFMT_DPCOMPLEX:  
-			CONV_FLOAT( double ); 
-			break;
-
-		default:
-			g_assert_not_reached();
-		}
-	}
-
-	VIPS_GATE_STOP( "conv_gen: work" ); 
-
-	return( 0 );
-}
-
-/* INT inner loops.
- */
-#define CONV3x3_INT( TYPE, IM_CLIP ) { \
-	TYPE * restrict p0 = (TYPE *) IM_REGION_ADDR( ir, le, y ); \
-	TYPE * restrict p1 = (TYPE *) IM_REGION_ADDR( ir, le, y + 1 ); \
-	TYPE * restrict p2 = (TYPE *) IM_REGION_ADDR( ir, le, y + 2 ); \
-	TYPE * restrict q = (TYPE *) IM_REGION_ADDR( or, le, y ); \
-	\
-	for( x = 0; x < sz; x++ ) {  \
-		int sum; \
- 		\
-		sum = 0; \
-		sum += m[0] * p0[0]; \
-		sum += m[1] * p0[bands]; \
-		sum += m[2] * p0[bands * 2]; \
-		sum += m[3] * p1[0]; \
-		sum += m[4] * p1[bands]; \
-		sum += m[5] * p1[bands * 2]; \
-		sum += m[6] * p2[0]; \
-		sum += m[7] * p2[bands]; \
-		sum += m[8] * p2[bands * 2]; \
-		\
-		p0 += 1; \
-		p1 += 1; \
-		p2 += 1; \
- 		\
-		sum = ((sum + rounding) / mask->scale) + mask->offset; \
- 		\
-		IM_CLIP; \
-		\
-		q[x] = sum;  \
-	}  \
-} 
-
-/* FLOAT inner loops.
- */
-#define CONV3x3_FLOAT( TYPE ) { \
-	TYPE * restrict p0 = (TYPE *) IM_REGION_ADDR( ir, le, y ); \
-	TYPE * restrict p1 = (TYPE *) IM_REGION_ADDR( ir, le, y + 1 ); \
-	TYPE * restrict p2 = (TYPE *) IM_REGION_ADDR( ir, le, y + 2 ); \
-	TYPE * restrict q = (TYPE *) IM_REGION_ADDR( or, le, y ); \
-	\
-	for( x = 0; x < sz; x++ ) {  \
-		double sum; \
- 		\
-		sum = 0; \
-		sum += m[0] * p0[0]; \
-		sum += m[1] * p0[bands]; \
-		sum += m[2] * p0[bands * 2]; \
-		sum += m[3] * p1[0]; \
-		sum += m[4] * p1[bands]; \
-		sum += m[5] * p1[bands * 2]; \
-		sum += m[6] * p2[0]; \
-		sum += m[7] * p2[bands]; \
-		sum += m[8] * p2[bands * 2]; \
- 		\
-		p0 += 1; \
-		p1 += 1; \
-		p2 += 1; \
- 		\
-		sum = (sum / mask->scale) + mask->offset; \
-		\
-		q[x] = sum;  \
-	}  \
-} 
-
-/* 3x3 masks are very common, so we have a special path for them. This is
- * about 20% faster than the general convolver above.
- */
-static int
-conv3x3_gen( REGION *or, void *vseq, void *a, void *b )
-{
-	ConvSequence *seq = (ConvSequence *) vseq;
-	IMAGE *in = (IMAGE *) a;
-	Conv *conv = (Conv *) b;
-	REGION *ir = seq->ir;
-	INTMASK *mask = conv->mask;
-	int * restrict m = mask->coeff; 
-
-	/* You might think this should be (scale + 1) / 2, but then we'd be 
-	 * adding one for scale == 1.
-	 */
-	int rounding = mask->scale / 2;
-
-	Rect *r = &or->valid;
-	int le = r->left;
-	int to = r->top;
-	int bo = IM_RECT_BOTTOM( r );
-	int sz = IM_REGION_N_ELEMENTS( or ) * (im_iscomplex( in ) ? 2 : 1);
-	int bands = in->Bands;
-
-	Rect s;
-	int x, y;
-
-	/* Prepare the section of the input image we need. A little larger
-	 * than the section of the output image we are producing.
-	 */
-	s = *r;
-	s.width += 2;
-	s.height += 2;
-	if( im_prepare( ir, &s ) )
-		return( -1 );
-
-	VIPS_GATE_START( "conv3x3_gen: work" ); 
-
-	for( y = to; y < bo; y++ ) { 
-		switch( in->BandFmt ) {
-		case IM_BANDFMT_UCHAR: 	
-			CONV3x3_INT( unsigned char, 
-				IM_CLIP_UCHAR( sum, seq ) ); 
-			break;
-
-		case IM_BANDFMT_CHAR:   
-			CONV3x3_INT( signed char, 
-				IM_CLIP_CHAR( sum, seq ) ); 
-			break;
-
-		case IM_BANDFMT_USHORT: 
-			CONV3x3_INT( unsigned short, 
-				IM_CLIP_USHORT( sum, seq ) ); 
-			break;
-
-		case IM_BANDFMT_SHORT:  
-			CONV3x3_INT( signed short, 
-				IM_CLIP_SHORT( sum, seq ) ); 
-			break;
-
-		case IM_BANDFMT_UINT:   
-			CONV3x3_INT( unsigned int, 
-				IM_CLIP_NONE( sum, seq ) ); 
-			break;
-
-		case IM_BANDFMT_INT:    
-			CONV3x3_INT( signed int, 
-				IM_CLIP_NONE( sum, seq ) ); 
-			break;
-
-		case IM_BANDFMT_FLOAT:  
-		case IM_BANDFMT_COMPLEX:  
-			CONV3x3_FLOAT( float ); 
-			break;
-
-		case IM_BANDFMT_DOUBLE: 
-		case IM_BANDFMT_DPCOMPLEX: 
-			CONV3x3_FLOAT( double ); 
-			break;
-
-		default:
-			g_assert_not_reached();
-		}
-	}
-
-	VIPS_GATE_STOP( "conv3x3_gen: work" ); 
-
-	return( 0 );
-}
-
-/* The VipsVector codepath.
- */
-static int
-convvec_gen( REGION *or, void *vseq, void *a, void *b )
-{
-	ConvSequence *seq = (ConvSequence *) vseq;
-	IMAGE *in = (IMAGE *) a;
-	Conv *conv = (Conv *) b;
-	INTMASK *mask = conv->mask;
-	REGION *ir = seq->ir;
-
-	Rect *r = &or->valid;
-	int sz = IM_REGION_N_ELEMENTS( or ) * (im_iscomplex( in ) ? 2 : 1);
-
-	Rect s;
-	int j, y;
-	VipsExecutor convolve[MAX_PASS];
-	VipsExecutor clip;
-
-	/* Prepare the section of the input image we need. A little larger
-	 * than the section of the output image we are producing.
-	 */
-	s = *r;
-	s.width += mask->xsize - 1;
-	s.height += mask->ysize - 1;
-	if( im_prepare( ir, &s ) )
-		return( -1 );
-
-	VIPS_GATE_START( "convvec_gen: work" ); 
-
-	for( j = 0; j < conv->n_pass; j++ ) 
-		vips_executor_set_program( &convolve[j], 
-			conv->pass[j].vector, sz );
-	vips_executor_set_program( &clip, conv->clip, sz );
-
-	for( y = 0; y < r->height; y++ ) { 
-#ifdef DEBUG_PIXELS
-{
-		int h, v;
-
-		printf( "before convolve: %d, %d\n", r->left, r->top + y );
-		for( v = 0; v < mask->ysize; v++ ) {
-			for( h = 0; h < mask->xsize; h++ )
-				printf( "%3d ", *IM_REGION_ADDR( ir, 
-					r->left + h, r->top + y + v ) );
-			printf( "\n" );
-		}
-}
-#endif /*DEBUG_PIXELS*/
-
-		for( j = 0; j < conv->n_pass; j++ ) {
-			/* We always read from t1 and write to t2.
-			 */
-			vips_executor_set_scanline( &convolve[j], 
-				ir, r->left, r->top + y );
-			vips_executor_set_array( &convolve[j],
-				conv->pass[j].r, seq->t1 );
-			vips_executor_set_destination( &convolve[j], seq->t2 );
-			vips_executor_run( &convolve[j] );
-
-			IM_SWAP( void *, seq->t1, seq->t2 );
-		}
-
-#ifdef DEBUG_PIXELS
-		printf( "before clip: %d\n", ((signed short *) seq->t1)[0] );
-#endif /*DEBUG_PIXELS*/
-
-		vips_executor_set_array( &clip, conv->s1, seq->t1 );
-		vips_executor_set_destination( &clip, 
-			IM_REGION_ADDR( or, r->left, r->top + y ) );
-		vips_executor_run( &clip );
-
-#ifdef DEBUG_PIXELS
-		printf( "after clip: %d\n", 
-			*IM_REGION_ADDR( or, r->left, r->top + y ) );
-#endif /*DEBUG_PIXELS*/
-	}
-
-	VIPS_GATE_STOP( "convvec_gen: work" ); 
-
-	return( 0 );
-}
-
-int
-im_conv_raw( IMAGE *in, IMAGE *out, INTMASK *mask )
-{
-	Conv *conv;
-	im_generate_fn generate;
-
-#ifdef DEBUG
-	printf( "im_conv_raw: starting with matrix:\n" );
-	im_print_imask( mask );
-#endif /*DEBUG*/
-
-	/* Check parameters.
-	 */
-	if( im_piocheck( in, out ) ||
-		im_check_uncoded( "im_conv", in ) ||
-		im_check_imask( "im_conv", mask ) ) 
-		return( -1 );
-	if( mask->scale == 0 ) {
-		im_error( "im_conv", "%s", "mask scale must be non-zero" );
-		return( -1 );
-	}
-	if( !(conv = conv_new( in, out, mask )) )
-		return( -1 );
-
-	/* Prepare output. Consider a 7x7 mask and a 7x7 image --- the output
-	 * would be 1x1.
-	 */
-	if( im_cp_desc( out, in ) )
-		return( -1 );
-	out->Xsize -= mask->xsize - 1;
-	out->Ysize -= mask->ysize - 1;
-	if( out->Xsize <= 0 || out->Ysize <= 0 ) {
-		im_error( "im_conv", "%s", _( "image too small for mask" ) );
-		return( -1 );
-	}
-
-	if( conv->n_pass ) {
-		generate = convvec_gen;
-
-#ifdef DEBUG
-		printf( "im_conv_raw: using vector path\n" );
-#endif /*DEBUG*/
-	}
-	else if( mask->xsize == 3 && mask->ysize == 3 ) {
-		generate = conv3x3_gen;
-
-#ifdef DEBUG
-		printf( "im_conv_raw: using 3x3 path\n" );
-#endif /*DEBUG*/
-	}
-	else {
-		generate = conv_gen;
-
-#ifdef DEBUG
-		printf( "im_conv_raw: using general path\n" );
-#endif /*DEBUG*/
-	}
-
-	if( im_demand_hint( out, IM_SMALLTILE, in, NULL ) ||
-		im_generate( out, conv_start, generate, conv_stop, in, conv ) )
-		return( -1 );
-
-	out->Xoffset = -mask->xsize / 2;
-	out->Yoffset = -mask->ysize / 2;
-
-	return( 0 );
-}
-
-int 
-im_conv( IMAGE *in, IMAGE *out, INTMASK *mask )
-{
-	IMAGE *t1 = im_open_local( out, "im_conv intermediate", "p" );
-
-	if( !t1 || 
-		im_embed( in, t1, 1, mask->xsize / 2, mask->ysize / 2, 
-			in->Xsize + mask->xsize - 1, 
-			in->Ysize + mask->ysize - 1 ) ||
-		im_conv_raw( t1, out, mask ) )
-		return( -1 );
-
-	out->Xoffset = 0;
-	out->Yoffset = 0;
-
-	return( 0 );
-}
diff --git a/libvips/convolution/im_conv_f.c b/libvips/convolution/im_conv_f.c
deleted file mode 100644
index 0cbe104b..00000000
--- a/libvips/convolution/im_conv_f.c
+++ /dev/null
@@ -1,389 +0,0 @@
-/* im_conv_f
- *
- * Copyright: 1990, N. Dessipris.
- *
- * Author: Nicos Dessipris & Kirk Martinez
- * Written on: 29/04/1991
- * Modified on: 19/05/1991
- * 8/7/93 JC
- *      - adapted for partial v2
- *      - memory leaks fixed
- *      - ANSIfied
- * 12/7/93 JC
- *	- adapted im_convbi() to im_convbf()
- * 7/10/94 JC
- *	- new IM_ARRAY() macro
- *	- evalend callbacks
- *	- more typedef
- * 9/3/01 JC
- *	- redone from im_conv() 
- * 27/7/01 JC
- *	- rejects masks with scale == 0
- * 7/4/04 
- *	- now uses im_embed() with edge stretching on the input, not
- *	  the output
- *	- sets Xoffset / Yoffset
- * 11/11/05
- * 	- simpler inner loop avoids gcc4 bug 
- * 12/11/09
- * 	- only rebuild the buffer offsets if bpl changes
- * 	- tiny speedups and cleanups
- * 	- add restrict, though it doesn't seem to help gcc
- * 	- add mask-all-zero check
- * 13/11/09
- * 	- rename as im_conv_f() to make it easier for vips.c to make the
- * 	  overloaded version
- * 3/2/10
- * 	- gtkdoc
- * 	- more cleanups
- * 1/10/10
- * 	- support complex (just double the bands)
- * 29/10/10
- * 	- get rid of im_convsep_f(), just call this twice, no longer worth
- * 	  keeping two versions
- * 15/10/11 Nicolas
- * 	- handle offset correctly in seperable convolutions
- * 26/1/16 Lovell Fuller
- * 	- remove Duff for a 25% speedup
- */
-
-/*
-
-    This file is part of VIPS.
-    
-    VIPS is free software; you can redistribute it and/or modify
-    it under the terms of the GNU Lesser General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
-    02110-1301  USA
-
- */
-
-/*
-
-    These files are distributed with VIPS - http://www.vips.ecs.soton.ac.uk
-
- */
-
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif /*HAVE_CONFIG_H*/
-#include <vips/intl.h>
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <limits.h>
-
-#include <vips/vips.h>
-
-/* Our parameters ... we take a copy of the mask argument, plus we make a
- * smaller version with the zeros squeezed out. 
- */
-typedef struct {
-	IMAGE *in;
-	IMAGE *out;
-	DOUBLEMASK *mask;	/* Copy of mask arg */
-
-	int nnz;		/* Number of non-zero mask elements */
-	double *coeff;		/* Array of non-zero mask coefficients */
-	int *coeff_pos;		/* Index of each nnz element in mask->coeff */
-} Conv;
-
-static int
-conv_close( Conv *conv )
-{
-	IM_FREEF( im_free_dmask, conv->mask );
-
-        return( 0 );
-}
-
-static Conv *
-conv_new( IMAGE *in, IMAGE *out, DOUBLEMASK *mask )
-{
-        Conv *conv = IM_NEW( out, Conv );
-	const int ne = mask->xsize * mask->ysize;
-        int i;
-
-        if( !conv )
-                return( NULL );
-
-        conv->in = in;
-        conv->out = out;
-        conv->mask = NULL;
-        conv->nnz = 0;
-        conv->coeff = NULL;
-
-        if( im_add_close_callback( out, 
-		(im_callback_fn) conv_close, conv, NULL ) ||
-        	!(conv->coeff = IM_ARRAY( out, ne, double )) ||
-        	!(conv->coeff_pos = IM_ARRAY( out, ne, int )) ||
-        	!(conv->mask = im_dup_dmask( mask, "conv_mask" )) )
-                return( NULL );
-
-        /* Find non-zero mask elements.
-         */
-        for( i = 0; i < ne; i++ )
-                if( mask->coeff[i] ) {
-			conv->coeff[conv->nnz] = mask->coeff[i];
-			conv->coeff_pos[conv->nnz] = i;
-			conv->nnz += 1;
-		}
-
-	/* Was the whole mask zero? We must have at least 1 element in there:
-	 * set it to zero.
-	 */
-	if( conv->nnz == 0 ) {
-		conv->coeff[0] = mask->coeff[0];
-		conv->coeff_pos[0] = 0;
-		conv->nnz = 1;
-	}
-
-        return( conv );
-}
-
-/* Our sequence value.
- */
-typedef struct {
-	Conv *conv;
-	REGION *ir;		/* Input region */
-
-	int *offsets;		/* Offsets for each non-zero matrix element */
-	VipsPel **pts;		/* Per-non-zero mask element image pointers */
-
-	int last_bpl;		/* Avoid recalcing offsets, if we can */
-} ConvSequence;
-
-/* Free a sequence value.
- */
-static int
-conv_stop( void *vseq, void *a, void *b )
-{
-	ConvSequence *seq = (ConvSequence *) vseq;
-
-	IM_FREEF( im_region_free, seq->ir );
-
-	return( 0 );
-}
-
-/* Convolution start function.
- */
-static void *
-conv_start( IMAGE *out, void *a, void *b )
-{
-	IMAGE *in = (IMAGE *) a;
-	Conv *conv = (Conv *) b;
-	ConvSequence *seq;
-
-	if( !(seq = IM_NEW( out, ConvSequence )) )
-		return( NULL );
-
-	/* Init!
-	 */
-	seq->conv = conv;
-	seq->ir = NULL;
-	seq->pts = NULL;
-	seq->last_bpl = -1;
-
-	/* Attach region and arrays.
-	 */
-	seq->ir = im_region_create( in );
-	seq->offsets = IM_ARRAY( out, conv->nnz, int );
-	seq->pts = IM_ARRAY( out, conv->nnz, VipsPel * );
-	if( !seq->ir || !seq->offsets || !seq->pts ) {
-		conv_stop( seq, in, conv );
-		return( NULL );
-	}
-
-	return( (void *) seq );
-}
-
-#define CONV_FLOAT( ITYPE, OTYPE ) { \
-	ITYPE ** restrict p = (ITYPE **) seq->pts; \
-	OTYPE * restrict q = (OTYPE *) IM_REGION_ADDR( or, le, y ); \
-	\
-	for( x = 0; x < sz; x++ ) {  \
-		double sum; \
-		int i; \
-		\
-		sum = 0; \
-		for ( i = 0; i < nnz; i++ ) \
-			sum += t[i] * p[i][x]; \
- 		\
-		sum = (sum / mask->scale) + mask->offset; \
-		\
-		q[x] = sum;  \
-	}  \
-} 
-
-/* Convolve!
- */
-static int
-conv_gen( REGION *or, void *vseq, void *a, void *b )
-{
-	ConvSequence *seq = (ConvSequence *) vseq;
-	IMAGE *in = (IMAGE *) a;
-	Conv *conv = (Conv *) b;
-	REGION *ir = seq->ir;
-	DOUBLEMASK *mask = conv->mask;
-	double * restrict t = conv->coeff; 
-	const int nnz = conv->nnz;
-
-	Rect *r = &or->valid;
-	Rect s;
-	int le = r->left;
-	int to = r->top;
-	int bo = IM_RECT_BOTTOM(r);
-	int sz = IM_REGION_N_ELEMENTS( or ) * 
-		(vips_band_format_iscomplex( in->BandFmt ) ? 2 : 1);
-
-	int x, y, z, i;
-
-	/* Prepare the section of the input image we need. A little larger
-	 * than the section of the output image we are producing.
-	 */
-	s = *r;
-	s.width += mask->xsize - 1;
-	s.height += mask->ysize - 1;
-	if( im_prepare( ir, &s ) )
-		return( -1 );
-
-        /* Fill offset array. Only do this if the bpl has changed since the 
-	 * previous im_prepare().
-	 */
-	if( seq->last_bpl != IM_REGION_LSKIP( ir ) ) {
-		seq->last_bpl = IM_REGION_LSKIP( ir );
-
-		for( i = 0; i < nnz; i++ ) {
-			z = conv->coeff_pos[i];
-			x = z % conv->mask->xsize;
-			y = z / conv->mask->xsize;
-
-			seq->offsets[i] = 
-				IM_REGION_ADDR( ir, x + le, y + to ) -
-				IM_REGION_ADDR( ir, le, to );
-		}
-	}
-
-	for( y = to; y < bo; y++ ) { 
-		/* Init pts for this line of PELs.
-		 */
-		for( z = 0; z < nnz; z++ )
-			seq->pts[z] = seq->offsets[z] +
-				IM_REGION_ADDR( ir, le, y ); 
-
-		switch( in->BandFmt ) {
-		case IM_BANDFMT_UCHAR: 	
-			CONV_FLOAT( unsigned char, float ); break;
-		case IM_BANDFMT_CHAR:   
-			CONV_FLOAT( signed char, float ); break;
-		case IM_BANDFMT_USHORT: 
-			CONV_FLOAT( unsigned short, float ); break;
-		case IM_BANDFMT_SHORT:  
-			CONV_FLOAT( signed short, float ); break;
-		case IM_BANDFMT_UINT:   
-			CONV_FLOAT( unsigned int, float ); break;
-		case IM_BANDFMT_INT:    
-			CONV_FLOAT( signed int, float ); break;
-		case IM_BANDFMT_FLOAT:  
-		case IM_BANDFMT_COMPLEX:  
-			CONV_FLOAT( float, float ); break;
-		case IM_BANDFMT_DOUBLE: 
-		case IM_BANDFMT_DPCOMPLEX:  
-			CONV_FLOAT( double, double ); break;
-
-		default:
-			g_assert_not_reached();
-		}
-	}
-
-	return( 0 );
-}
-
-int
-im_conv_f_raw( IMAGE *in, IMAGE *out, DOUBLEMASK *mask )
-{
-	Conv *conv;
-
-	/* Check parameters.
-	 */
-	if( im_piocheck( in, out ) ||
-		im_check_uncoded( "im_conv", in ) ||
-		im_check_dmask( "im_conv", mask ) ) 
-		return( -1 );
-	if( mask->scale == 0 ) {
-		im_error( "im_conv_f", "%s", "mask scale must be non-zero" );
-		return( -1 );
-	}
-	if( !(conv = conv_new( in, out, mask )) )
-		return( -1 );
-
-	/* Prepare output. Consider a 7x7 mask and a 7x7 image --- the output
-	 * would be 1x1.
-	 */
-	if( im_cp_desc( out, in ) )
-		return( -1 );
-	if( vips_bandfmt_isint( in->BandFmt ) ) 
-		out->BandFmt = IM_BANDFMT_FLOAT;
-	out->Xsize -= mask->xsize - 1;
-	out->Ysize -= mask->ysize - 1;
-	if( out->Xsize <= 0 || out->Ysize <= 0 ) {
-		im_error( "im_conv_f", "%s", _( "image too small for mask" ) );
-		return( -1 );
-	}
-
-	if( im_demand_hint( out, IM_SMALLTILE, in, NULL ) )
-		return( -1 );
-
-	if( im_generate( out, conv_start, conv_gen, conv_stop, in, conv ) )
-		return( -1 );
-
-	out->Xoffset = -mask->xsize / 2;
-	out->Yoffset = -mask->ysize / 2;
-
-	return( 0 );
-}
-
-/**
- * im_conv_f:
- * @in: input image
- * @out: output image
- * @mask: convolution mask
- *
- * Convolve @in with @mask using floating-point arithmetic. The output image 
- * is always %IM_BANDFMT_FLOAT unless @in is %IM_BANDFMT_DOUBLE, in which case
- * @out is also %IM_BANDFMT_DOUBLE. 
- *
- * Each output pixel is
- * calculated as sigma[i]{pixel[i] * mask[i]} / scale + offset, where scale
- * and offset are part of @mask. 
- *
- * See also: im_conv(), im_convsep_f(), im_create_dmaskv().
- *
- * Returns: 0 on success, -1 on error
- */
-int 
-im_conv_f( IMAGE *in, IMAGE *out, DOUBLEMASK *mask )
-{
-	IMAGE *t1 = im_open_local( out, "im_conv_f intermediate", "p" );
-
-	if( !t1 || 
-		im_embed( in, t1, 1, mask->xsize / 2, mask->ysize / 2, 
-			in->Xsize + mask->xsize - 1, 
-			in->Ysize + mask->ysize - 1 ) ||
-		im_conv_f_raw( t1, out, mask ) )
-		return( -1 );
-
-	out->Xoffset = 0;
-	out->Yoffset = 0;
-
-	return( 0 );
-}
diff --git a/libvips/create/worley.c b/libvips/create/worley.c
index dc509731..514b7258 100644
--- a/libvips/create/worley.c
+++ b/libvips/create/worley.c
@@ -1,6 +1,9 @@
 /* Worley noise generator.
  *
  * 19/7/16
+ *
+ * 11/8/16
+ * 	- float output
  */
 
 /*
@@ -215,7 +218,7 @@ vips_worley_start( VipsImage *out, void *a, void *b )
 	return( seq );
 }
 
-static int
+static float
 vips_hypot( int x, int y )
 {
 	/* Faster than hypot() for int args.
@@ -223,10 +226,10 @@ vips_hypot( int x, int y )
 	return( sqrt( x * x + y * y ) );
 }
 
-static int
+static float
 vips_worley_distance( VipsWorley *worley, Cell cells[9], int x, int y )
 {
-	int distance;
+	float distance;
 
 	int i, j;
 
@@ -236,7 +239,7 @@ vips_worley_distance( VipsWorley *worley, Cell cells[9], int x, int y )
 		Cell *cell = &cells[i];
 
 		for( j = 0; j < cell->n_features; j++ ) {
-			int d = vips_hypot( 
+			float d = vips_hypot( 
 				x - cell->feature_x[j], 
 				y - cell->feature_y[j] );
 
@@ -258,7 +261,7 @@ vips_worley_gen( VipsRegion *or, void *vseq, void *a, void *b,
 	int x, y;
 
 	for( y = 0; y < r->height; y++ ) {
-		int *q = (int *) VIPS_REGION_ADDR( or, r->left, r->top + y );
+		float *q = (float *) VIPS_REGION_ADDR( or, r->left, r->top + y );
 
 		for( x = 0; x < r->width; x++ ) {
 			int cell_x = (r->left + x) / worley->cell_size;
@@ -300,7 +303,7 @@ vips_worley_build( VipsObject *object )
 
 	vips_image_init_fields( create->out,
 		worley->width, worley->height, 1,
-		VIPS_FORMAT_INT, VIPS_CODING_NONE, VIPS_INTERPRETATION_B_W,
+		VIPS_FORMAT_FLOAT, VIPS_CODING_NONE, VIPS_INTERPRETATION_B_W,
 		1.0, 1.0 );
 	vips_image_pipelinev( create->out,
 		VIPS_DEMAND_STYLE_ANY, NULL );
@@ -365,7 +368,7 @@ vips_worley_init( VipsWorley *worley )
  *
  * * @cell_size: %gint, size of Worley cells
  *
- * Create a one-band int image of Worley noise. See:
+ * Create a one-band float image of Worley noise. See:
  *
  * https://en.wikipedia.org/wiki/Worley_noise
  *
diff --git a/libvips/deprecated/vips7compat.c b/libvips/deprecated/vips7compat.c
index 18713e38..10eec9d0 100644
--- a/libvips/deprecated/vips7compat.c
+++ b/libvips/deprecated/vips7compat.c
@@ -2404,6 +2404,116 @@ im_convsep_f( IMAGE *in, IMAGE *out, DOUBLEMASK *mask )
 	return( 0 );
 }
 
+int 
+im_conv( VipsImage *in, VipsImage *out, INTMASK *mask )
+{
+	VipsImage *t1, *t2;
+
+	if( !(t1 = vips_image_new()) ||
+		im_imask2vips( mask, t1 ) )
+		return( -1 );
+	if( vips_convi( in, &t2, t1, 
+		NULL ) ) {
+		g_object_unref( t1 );
+		return( -1 );
+	}
+	g_object_unref( t1 );
+	if( vips_image_write( t2, out ) ) {
+		g_object_unref( t2 );
+		return( -1 );
+	}
+	g_object_unref( t2 );
+
+	return( 0 );
+}
+
+int
+im_conv_raw( VipsImage *in, VipsImage *out, INTMASK *mask )
+{
+	im_error( "im_conv_raw", "no compat function" );
+	return( -1 );
+}
+
+int 
+im_conv_f( VipsImage *in, VipsImage *out, DOUBLEMASK *mask )
+{
+	VipsImage *t1, *t2;
+
+	if( !(t1 = vips_image_new()) ||
+		im_mask2vips( mask, t1 ) )
+		return( -1 );
+	if( vips_convf( in, &t2, t1, 
+		NULL ) ) {
+		g_object_unref( t1 );
+		return( -1 );
+	}
+	g_object_unref( t1 );
+	if( vips_image_write( t2, out ) ) {
+		g_object_unref( t2 );
+		return( -1 );
+	}
+	g_object_unref( t2 );
+
+	return( 0 );
+}
+
+int 
+im_aconvsep( VipsImage *in, VipsImage *out, DOUBLEMASK *mask, int n_layers )
+{
+	VipsImage *t1, *t2;
+
+	if( !(t1 = vips_image_new()) ||
+		im_mask2vips( mask, t1 ) )
+		return( -1 );
+	if( vips_convasep( in, &t2, t1, 
+		"layers", n_layers,
+		NULL ) ) {
+		g_object_unref( t1 );
+		return( -1 );
+	}
+	g_object_unref( t1 );
+	if( vips_image_write( t2, out ) ) {
+		g_object_unref( t2 );
+		return( -1 );
+	}
+	g_object_unref( t2 );
+
+	return( 0 );
+}
+
+int 
+im_aconv( VipsImage *in, VipsImage *out, 
+	DOUBLEMASK *mask, int n_layers, int cluster )
+{
+	VipsImage *t1, *t2;
+
+	if( !(t1 = vips_image_new()) ||
+		im_mask2vips( mask, t1 ) )
+		return( -1 );
+	if( vips_conva( in, &t2, t1, 
+		"layers", n_layers,
+		"cluster", cluster,
+		NULL ) ) {
+		g_object_unref( t1 );
+		return( -1 );
+	}
+	g_object_unref( t1 );
+	if( vips_image_write( t2, out ) ) {
+		g_object_unref( t2 );
+		return( -1 );
+	}
+	g_object_unref( t2 );
+
+	return( 0 );
+}
+
+int
+im_conv_f_raw( VipsImage *in, VipsImage *out, DOUBLEMASK *mask )
+{
+	im_error( "im_conv_f_raw", "no compat function" );
+	return( -1 );
+}
+
 int
 im_addgnoise( IMAGE *in, IMAGE *out, double sigma )
 {
diff --git a/libvips/foreign/csv.c b/libvips/foreign/csv.c
index 2e35cf16..f72d76f1 100644
--- a/libvips/foreign/csv.c
+++ b/libvips/foreign/csv.c
@@ -30,6 +30,8 @@
  * 4/6/15
  * 	- try to support DOS files under linux ... we have to look for \r\n
  * 	  linebreaks
+ * 12/8/16
+ * 	- allow missing offset in matrix header
  */
 
 /*
@@ -522,6 +524,8 @@ read_ascii_double( FILE *fp, const char whitemap[256], double *out )
 
 /* Read the header. Two numbers for width and height, and two optional
  * numbers for scale and offset. 
+ *
+ * We can have scale and no offset, in which case we assume offset = 0.
  */
 static int
 vips__matrix_header( char *whitemap, FILE *fp,
@@ -532,6 +536,11 @@ vips__matrix_header( char *whitemap, FILE *fp,
 	int i;
 	int ch;
 
+	/* Offset defaults to zero.
+	 */
+	header[2] = 1.0;
+	header[3] = 0.0;
+
 	for( i = 0; i < 4 && 
 		(ch = read_ascii_double( fp, whitemap, &header[i] )) == 0; 
 		i++ )
@@ -556,22 +565,17 @@ vips__matrix_header( char *whitemap, FILE *fp,
 			"%s", _( "width / height out of range" ) );
 		return( -1 );
 	}
-	if( i == 3 ) { 
-		vips_error( "mask2vips", "%s", _( "bad scale / offset" ) );
-		return( -1 );
-	}
 	if( (ch = read_ascii_double( fp, whitemap, &d )) != '\n' ) {
 		vips_error( "mask2vips", "%s", _( "extra chars in header" ) );
 		return( -1 );
 	}
-	if( i > 2 && 
-		header[2] == 0.0 ) {
+	if( header[2] == 0.0 ) {
 		vips_error( "mask2vips", "%s", _( "zero scale" ) );
 		return( -1 );
 	}
 
-	*scale = i > 2 ?  header[2] : 1.0;
-	*offset = i > 2 ?  header[3] : 0.0;
+	*scale = header[2];
+	*offset = header[3];
 
 	skip_line( fp );
 
diff --git a/libvips/include/vips/convolution.h b/libvips/include/vips/convolution.h
index 1240301e..a79df95b 100644
--- a/libvips/include/vips/convolution.h
+++ b/libvips/include/vips/convolution.h
@@ -31,8 +31,8 @@
 
  */
 
-#ifndef IM_CONVOLUTION_H
-#define IM_CONVOLUTION_H
+#ifndef VIPS_CONVOLUTION_H
+#define VIPS_CONVOLUTION_H
 
 #ifdef __cplusplus
 extern "C" {
@@ -46,15 +46,23 @@ typedef enum {
 
 int vips_conv( VipsImage *in, VipsImage **out, VipsImage *mask, ... )
 	__attribute__((sentinel));
-int vips_compass( VipsImage *in, VipsImage **out, VipsImage *mask, ... )
+int vips_convf( VipsImage *in, VipsImage **out, VipsImage *mask, ... )
+	__attribute__((sentinel));
+int vips_convi( VipsImage *in, VipsImage **out, VipsImage *mask, ... )
+	__attribute__((sentinel));
+int vips_conva( VipsImage *in, VipsImage **out, VipsImage *mask, ... )
 	__attribute__((sentinel));
 int vips_convsep( VipsImage *in, VipsImage **out, VipsImage *mask, ... )
 	__attribute__((sentinel));
+int vips_convasep( VipsImage *in, VipsImage **out, VipsImage *mask, ... )
+	__attribute__((sentinel));
 
-int vips_sharpen( VipsImage *in, VipsImage **out, ... ) 
+int vips_compass( VipsImage *in, VipsImage **out, VipsImage *mask, ... )
 	__attribute__((sentinel));
 int vips_gaussblur( VipsImage *in, VipsImage **out, double sigma, ... )
 	__attribute__((sentinel));
+int vips_sharpen( VipsImage *in, VipsImage **out, ... ) 
+	__attribute__((sentinel));
 
 int vips_spcor( VipsImage *in, VipsImage *ref, VipsImage **out, ... )
 	__attribute__((sentinel));
@@ -65,4 +73,4 @@ int vips_fastcor( VipsImage *in, VipsImage *ref, VipsImage **out, ... )
 }
 #endif /*__cplusplus*/
 
-#endif /*IM_CONVOLUTION_H*/
+#endif /*VIPS_CONVOLUTION_H*/
diff --git a/libvips/include/vips/internal.h b/libvips/include/vips/internal.h
index bc832d2a..6d32afa2 100644
--- a/libvips/include/vips/internal.h
+++ b/libvips/include/vips/internal.h
@@ -246,11 +246,12 @@ typedef struct _VipsImagePixels {
 	gint64 npels;		/* Number of pels calculated so far */
 } VipsImagePixels;
 
-int
-vips__foreign_convert_saveable( VipsImage *in, VipsImage **ready,
+int vips__foreign_convert_saveable( VipsImage *in, VipsImage **ready,
 	VipsSaveable saveable, VipsBandFormat *format, VipsCoding *coding,
 	VipsArrayDouble *background );
 
+int vips__image_intize( VipsImage *in, VipsImage **out );
+
 #ifdef __cplusplus
 }
 #endif /*__cplusplus*/
diff --git a/libvips/include/vips/vector.h b/libvips/include/vips/vector.h
index 8ce714cb..88e7eb20 100644
--- a/libvips/include/vips/vector.h
+++ b/libvips/include/vips/vector.h
@@ -136,6 +136,8 @@ void vips_executor_set_array( VipsExecutor *executor, int var, void *value );
 
 void vips_executor_run( VipsExecutor *executor ); 
 
+void vips_vector_to_fixed_point( double *in, int *out, int n, int scale );
+
 #ifdef __cplusplus
 }
 #endif /*__cplusplus*/
diff --git a/libvips/iofuncs/memory.c b/libvips/iofuncs/memory.c
index 21a11741..ca5e2cca 100644
--- a/libvips/iofuncs/memory.c
+++ b/libvips/iofuncs/memory.c
@@ -109,7 +109,15 @@ static GMutex *vips_tracked_mutex = NULL;
  * @OBJ: allocate memory local to @OBJ, or %NULL for no auto-free
  * @T: type of thing to allocate
  *
- * Returns: A pointer of type @T *, or %NULL on error.
+ * Allocate memory for a thing of type @T. The memory is not
+ * cleared. 
+ * 
+ * This macro cannot fail. See vips_tracked_malloc() if you are 
+ * allocating large amounts of memory.
+ *
+ * See also: vips_malloc().
+ *
+ * Returns: A pointer of type @T *.
  */
 
 /**
@@ -118,7 +126,15 @@ static GMutex *vips_tracked_mutex = NULL;
  * @N: number of @T 's to allocate
  * @T: type of thing to allocate
  *
- * Returns: A pointer of type @T *, or %NULL on error.
+ * Allocate memory for an array of objects of type @T. The memory is not
+ * cleared. 
+ *
+ * This macro cannot fail. See vips_tracked_malloc() if you are 
+ * allocating large amounts of memory.
+ *
+ * See also: vips_malloc().
+ *
+ * Returns: A pointer of type @T *.
  */
 
 static void
@@ -141,7 +157,7 @@ vips_malloc_cb( VipsObject *object, char *buf )
  *
  * See also: vips_tracked_malloc().
  *
- * Returns: (transfer full): a pointer to the allocated memory
+ * Returns: (transfer full): a pointer to the allocated memory.
  */
 void *
 vips_malloc( VipsObject *object, size_t size )
@@ -166,7 +182,7 @@ vips_malloc( VipsObject *object, size_t size )
  *
  * g_strdup() a string. When @object is freed, the string will be freed for
  * you.  If @object is %NULL, you need to 
- * free the memory explicitly with g_free().
+ * free the memory yourself with g_free().
  *
  * This function cannot fail. 
  *
diff --git a/libvips/iofuncs/vector.c b/libvips/iofuncs/vector.c
index b96ac01f..f405c3ad 100644
--- a/libvips/iofuncs/vector.c
+++ b/libvips/iofuncs/vector.c
@@ -522,3 +522,84 @@ vips_executor_run( VipsExecutor *executor )
 	orc_executor_run( &executor->executor );
 #endif /*HAVE_ORC*/
 }
+
+/* Make a fixed-point version of a matrix. Each 
+ * out[i] = rint(in[i] * adj_scale), where adj_scale is selected so that 
+ * sum(out) = sum(in) * scale.
+ *
+ * Because of the vagaries of rint(), we can't just calc this, we have to
+ * iterate and converge on the best value for adj_scale.
+ */
+void
+vips_vector_to_fixed_point( double *in, int *out, int n, int scale )
+{
+	double fsum;
+	int i;
+	int target;
+	int sum;
+	double high;
+	double low;
+	double guess;
+
+	fsum = 0.0;
+	for( i = 0; i < n; i++ )
+		fsum += in[i];
+	target = VIPS_RINT( fsum * scale );
+
+	/* As we rint() each scale element, we can get up to 0.5 error.
+	 * Therefore, by the end of the mask, we can be off by up to n/2. Our
+	 * high and low guesses are therefore n/2 either side of the obvious
+	 * answer.
+	 */
+	high = scale + (n + 1) / 2;
+	low = scale - (n + 1) / 2;
+
+	do {
+		guess = (high + low) / 2.0;
+
+		for( i = 0; i < n; i++ ) 
+			out[i] = VIPS_RINT( in[i] * guess );
+
+		sum = 0;
+		for( i = 0; i < n; i++ )
+			sum += out[i];
+
+		if( sum == target )
+			break;
+		if( sum < target )
+			low = guess;
+		if( sum > target )
+			high = guess;
+
+	/* This will typically produce about 5 iterations.
+	 */
+	} while( high - low > 0.01 );
+
+	if( sum != target ) {
+		/* Spread the error out thinly over the whole array. For
+		 * example, consider the matrix:
+		 *
+		 * 	3 3 9 0
+		 *	1 1 1
+		 *	1 1 1
+		 *	1 1 1
+		 *
+		 * being converted with scale = 64 (convi does this). We want
+		 * to generate a mix of 7s and 8s. 
+		 */
+		int each_error = (target - sum) / n;
+		int extra_error = (target - sum) % n;
+
+		/* To share the residual error, we add or subtract 1 from the
+		 * first abs(extra_error) elements.
+		 */
+		int direction = extra_error > 0 ? 1 : -1;
+		int n_elements = VIPS_ABS( extra_error );
+
+		for( i = 0; i < n; i++ )
+			out[i] += each_error;
+
+		for( i = 0; i < n_elements; i++ )
+			out[i] += direction;
+	}
+}
diff --git a/libvips/resample/reducev.cpp b/libvips/resample/reducev.cpp
index 760b3b0b..362ca861 100644
--- a/libvips/resample/reducev.cpp
+++ b/libvips/resample/reducev.cpp
@@ -702,65 +702,6 @@ vips_reducev_vector_gen( VipsRegion *out_region, void *vseq,
 	return( 0 );
 }
 
-/* Make a fixed-point version of a mask. Each out[i] = rint(in[i] * adj_scale), 
- * where adj_scale is selected so that sum(out) = sum(in) * scale.
- *
- * Because of the vagaries of rint(), we can't just calc this, we have to
- * iterate and converge on the best value for adj_scale.
- */
-static void
-vips_reducev_intize( double *in, int *out, int n, int scale )
-{
-	double fsum;
-	int target;
-	int sum;
-	double high;
-	double low;
-	double guess;
-
-	fsum = 0.0;
-	for( int i = 0; i < n; i++ )
-		fsum += in[i];
-	target = VIPS_RINT( fsum * scale );
-
-	/* As we rint() each scale element, we can get up to 0.5 error.
-	 * Therefore, by the end of the mask, we can be off by up to n/2. Our
-	 * high and low guesses are therefore n/2 either side of the obvious
-	 * answer.
-	 */
-	high = scale + (n + 1) / 2;
-	low = scale - (n + 1) / 2;
-
-	do {
-		guess = (high + low) / 2.0;
-
-		for( int i = 0; i < n; i++ ) 
-			out[i] = VIPS_RINT( in[i] * guess );
-
-		sum = 0;
-		for( int i = 0; i < n; i++ )
-			sum += out[i];
-
-		if( sum == target )
-			break;
-		if( sum < target )
-			low = guess;
-		if( sum > target )
-			high = guess;
-
-	/* This will typically produce about 5 iterations.
-	 */
-	} while( high - low > 0.01 );
-
-	if( sum != target ) 
-		/* We're as close as we can get ... add the remaining error to
-		 * the centre element. Hopefully we'll get slight sharpness 
-		 * changes rather than slight brightness changes and it'll
-		 * be less visible. 
-		 */
-		out[n / 2] += target - sum;
-}
-
 static int
 vips_reducev_raw( VipsReducev *reducev, VipsImage *in ) 
 {
@@ -798,7 +739,7 @@ vips_reducev_raw( VipsReducev *reducev, VipsImage *in )
 			if( !reducev->matrixi[y] )
 				return( -1 ); 
 
-			vips_reducev_intize( 
+			vips_vector_to_fixed_point( 
 				reducev->matrixf[y], reducev->matrixi[y], 
 				reducev->n_point, VIPS_INTERPOLATE_SCALE );
 		}
@@ -813,7 +754,7 @@ vips_reducev_raw( VipsReducev *reducev, VipsImage *in )
 			if( !reducev->matrixo[y] )
 				return( -1 ); 
 
-			vips_reducev_intize( 
+			vips_vector_to_fixed_point( 
 				reducev->matrixf[y], reducev->matrixo[y], 
 				reducev->n_point, 64 );
 		}
diff --git a/test/test_convolution.py b/test/test_convolution.py
index 03e62651..ea516db6 100755
--- a/test/test_convolution.py
+++ b/test/test_convolution.py
@@ -96,6 +96,12 @@ class TestConvolution(unittest.TestCase):
         for x, y in zip_expand(a, b):
             self.assertAlmostEqual(x, y, places = places, msg = msg)
 
+    # test a pair of things which can be lists for difference less than a
+    # threshold
+    def assertLessThreshold(self, a, b, diff):
+        for x, y in zip_expand(a, b):
+            self.assertLess(abs(x - y), diff)
+
     def setUp(self):
         im = Vips.Image.mask_ideal(100, 100, 0.5, reject = True, optical = True)
         self.colour = im * [1, 2, 3] + [2, 3, 4]
@@ -131,6 +137,26 @@ class TestConvolution(unittest.TestCase):
                     true = conv(im, msk, 49, 49)
                     self.assertAlmostEqualObjects(result, true)
 
+    # don't test conva, it's still not done
+    def dont_test_conva(self):
+        for im in self.all_images:
+            for msk in self.all_masks:
+                print("msk:")
+                msk.matrixprint()
+                print("im.bands = %s" % im.bands)
+
+                convolved = im.conv(msk, precision = Vips.Precision.APPROXIMATE)
+
+                result = convolved(25, 50)
+                true = conv(im, msk, 24, 49)
+                print("result = %s, true = %s" % (result, true))
+                self.assertLessThreshold(result, true, 5)
+
+                result = convolved(50, 50)
+                true = conv(im, msk, 49, 49)
+                print("result = %s, true = %s" % (result, true))
+                self.assertLessThreshold(result, true, 5)
+
     def test_compass(self):
         for im in self.all_images:
             for msk in self.all_masks:
diff --git a/test/test_create.py b/test/test_create.py
index 861ee07d..9641efa8 100755
--- a/test/test_create.py
+++ b/test/test_create.py
@@ -446,7 +446,7 @@ class TestCreate(unittest.TestCase):
         self.assertEqual(im.width, 512)
         self.assertEqual(im.height, 512)
         self.assertEqual(im.bands, 1)
-        self.assertEqual(im.format, Vips.BandFormat.INT)
+        self.assertEqual(im.format, Vips.BandFormat.FLOAT)
 
     def test_perlin(self):
         im = Vips.Image.perlin(512, 512)