diff --git a/ChangeLog b/ChangeLog
index b4324789..609a5217 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -39,6 +39,8 @@
 - mask gtk-doc done
 - add cfitsio dependancy
 - add FITS reader
+- land the vector branch ... we have SSE erode/dilate/add/conv
+- add IM_SWAP
 
 12/5/10 started 7.22.2
 - the conditional image of ifthenelse can be any format, a (!=0) is added if
diff --git a/TODO b/TODO
index da6942f8..e3b58958 100644
--- a/TODO
+++ b/TODO
@@ -1,4 +1,9 @@
-- add IM_SWAP, see orc branch util.h
+- lab [100,0,0] -> srgb [255, 255, 254]? how odd
+
+- scrap im_convsep_f() ... just use im_conv_f() twice
+
+- make im_rank() use IM_SWAP()
+
 
 
 - test fits reader more ... colour?
diff --git a/configure.in b/configure.in
index 6eabda21..4982de2a 100644
--- a/configure.in
+++ b/configure.in
@@ -375,6 +375,21 @@ if test x"$with_magick" != "xno"; then
   LIBS=$save_LIBS
 fi
 
+# orc
+AC_ARG_WITH([orc], 
+  AS_HELP_STRING([--without-orc], [build without orc (default: test)]))
+
+if test x"$with_orc" != "xno"; then
+  # we use loadpw etc.
+  PKG_CHECK_MODULES(ORC, orc-0.4 >= 0.4.11,
+    [AC_DEFINE(HAVE_ORC,1,[define if you have orc-0.4.11 or later installed.])
+     with_orc=yes
+     PACKAGES_USED="$PACKAGES_USED orc-0.4"],
+    [AC_MSG_WARN([orc-0.4.11 or later not found; disabling orc support])
+     with_orc=no
+    ])
+fi
+
 # lcms ... look for lcms2 first, it has better threading support
 AC_ARG_WITH([lcms], 
   AS_HELP_STRING([--without-lcms], [build without lcms (default: test)]))
@@ -570,14 +585,14 @@ fi
 # Gather all up for VIPS_CFLAGS, VIPS_INCLUDES, VIPS_LIBS and VIPS_CXX_LIBS
 # sort includes to get longer, more specific dirs first
 # helps, for example, selecting graphicsmagick over imagemagick
-VIPS_CFLAGS=`for i in $VIPS_CFLAGS $GTHREAD_CFLAGS $REQUIRED_CFLAGS $PANGOFT2_CFLAGS $FFTW3_CFLAGS $MAGICK_CFLAGS $PNG_CFLAGS $EXIF_CFLAGS $MATIO_CFLAGS $CFITSIO_CFLAGS $OPENEXR_CFLAGS  
+VIPS_CFLAGS=`for i in $VIPS_CFLAGS $GTHREAD_CFLAGS $REQUIRED_CFLAGS $PANGOFT2_CFLAGS $FFTW3_CFLAGS $MAGICK_CFLAGS $PNG_CFLAGS $EXIF_CFLAGS $MATIO_CFLAGS $CFITSIO_CFLAGS $OPENEXR_CFLAGS $ORC_CFLAGS
 do 
   echo $i 
 done | sort -ru`
 VIPS_CFLAGS=`echo $VIPS_CFLAGS`
 VIPS_CFLAGS="$VIPS_DEBUG_FLAGS $VIPS_CFLAGS"
 VIPS_INCLUDES="$PNG_INCLUDES $TIFF_INCLUDES $ZIP_INCLUDES $JPEG_INCLUDES $FFTW_INCLUDES $LCMS_INCLUDES" 
-VIPS_LIBS="$MAGICK_LIBS $PNG_LIBS $TIFF_LIBS $ZIP_LIBS $JPEG_LIBS $GTHREAD_LIBS $REQUIRED_LIBS $PANGOFT2_LIBS $FFTW3_LIBS $FFTW_LIBS $LCMS_LIBS $OPENEXR_LIBS $CFITSIO_LIBS $MATIO_LIBS $EXIF_LIBS -lm"
+VIPS_LIBS="$MAGICK_LIBS $PNG_LIBS $TIFF_LIBS $ZIP_LIBS $JPEG_LIBS $GTHREAD_LIBS $REQUIRED_LIBS $PANGOFT2_LIBS $FFTW3_LIBS $FFTW_LIBS $ORC_LIBS $LCMS_LIBS $OPENEXR_LIBS $CFITSIO_LIBS $MATIO_LIBS $EXIF_LIBS -lm"
 # need -lstdc++ for (eg.) the C++ format loaders
 VIPS_CXX_LIBS="-lstdc++"
 
@@ -660,6 +675,8 @@ build docs with gtkdoc 			$enable_gtk_doc
 use fftw3 for FFT: 			$with_fftw3
 Magick package: 			$with_magickpackage
 file import with libMagick: 		$with_magick
+accelerate loops with orc: 		$with_orc
+  (needs orc-0.4.11 or later)
 ICC profile support with lcms: 		$with_lcms (version $with_lcms_ver)
 file import with OpenEXR: 		$with_OpenEXR
 file import with matio: 		$with_matio
diff --git a/libvips/arithmetic/im_add.c b/libvips/arithmetic/im_add.c
index bf94ed42..5252a754 100644
--- a/libvips/arithmetic/im_add.c
+++ b/libvips/arithmetic/im_add.c
@@ -30,6 +30,8 @@
  * 	- more of operation scaffold moved inside
  * 25/7/10
  * 	- remove oil support again ... we'll try Orc instead
+ * 29/10/10
+ * 	- move to VipsVector for Orc support
  */
 
 /*
@@ -69,6 +71,7 @@
 
 #include <vips/vips.h>
 #include <vips/internal.h>
+#include <vips/vector.h>
 
 #ifdef WITH_DMALLOC
 #include <dmalloc.h>
@@ -83,6 +86,8 @@
 		q[x] = p1[x] + p2[x]; \
 }
 
+static VipsVector *add_vectors[IM_BANDFMT_LAST] = { NULL };
+
 static void
 add_buffer( PEL **in, PEL *out, int width, IMAGE *im )
 {
@@ -91,32 +96,49 @@ add_buffer( PEL **in, PEL *out, int width, IMAGE *im )
 	const int sz = width * im->Bands * 
 		(vips_bandfmt_iscomplex( im->BandFmt ) ? 2 : 1);
 
-	int x;
+	if( vips_vector_get_enabled() && 
+		add_vectors[im->BandFmt] ) {
+		VipsExecutor ex;
 
-	/* Add all input types. Keep types here in sync with bandfmt_add[] 
-	 * below.
-         */
-        switch( im->BandFmt ) {
-        case IM_BANDFMT_UCHAR: 	LOOP( unsigned char, unsigned short ); break; 
-        case IM_BANDFMT_CHAR: 	LOOP( signed char, signed short ); break; 
-        case IM_BANDFMT_USHORT: LOOP( unsigned short, unsigned int ); break; 
-        case IM_BANDFMT_SHORT: 	LOOP( signed short, signed int ); break; 
-        case IM_BANDFMT_UINT: 	LOOP( unsigned int, unsigned int ); break; 
-        case IM_BANDFMT_INT: 	LOOP( signed int, signed int ); break; 
+		vips_executor_set_program( &ex, add_vectors[im->BandFmt], sz );
+		vips_executor_set_source( &ex, 1, in[0] );
+		vips_executor_set_source( &ex, 2, in[1] );
+		vips_executor_set_destination( &ex, out );
 
-        case IM_BANDFMT_FLOAT: 		
-        case IM_BANDFMT_COMPLEX:
-		LOOP( float, float ); 
-		break; 
+		vips_executor_run( &ex );
+	}
+	else {
+		int x;
 
-        case IM_BANDFMT_DOUBLE:	
-        case IM_BANDFMT_DPCOMPLEX:
-		LOOP( double, double ); 
-		break;
+		/* Add all input types. Keep types here in sync with 
+		 * bandfmt_add[] below.
+		 */
+		switch( im->BandFmt ) {
+		case IM_BANDFMT_UCHAR: 	
+			LOOP( unsigned char, unsigned short ); break; 
+		case IM_BANDFMT_CHAR: 	
+			LOOP( signed char, signed short ); break; 
+		case IM_BANDFMT_USHORT: 
+			LOOP( unsigned short, unsigned int ); break; 
+		case IM_BANDFMT_SHORT: 	
+			LOOP( signed short, signed int ); break; 
+		case IM_BANDFMT_UINT: 	
+			LOOP( unsigned int, unsigned int ); break; 
+		case IM_BANDFMT_INT: 	
+			LOOP( signed int, signed int ); break; 
 
-        default:
-		g_assert( 0 );
-        }
+		case IM_BANDFMT_FLOAT: 		
+		case IM_BANDFMT_COMPLEX: 
+			LOOP( float, float ); break; 
+
+		case IM_BANDFMT_DOUBLE:	
+		case IM_BANDFMT_DPCOMPLEX: 
+			LOOP( double, double ); break;
+
+		default:
+			g_assert( 0 );
+		}
+	}
 }
 
 /* Save a bit of typing.
@@ -311,6 +333,106 @@ static int bandfmt_add[10] = {
    US, S,  UI, I,  UI, I, F, X, D, DX
 };
 
+void
+im__init_programs( VipsVector *vectors[IM_BANDFMT_LAST], 
+	int format_table[IM_BANDFMT_LAST] )
+{
+	int fmt;
+
+	for( fmt = 0; fmt < IM_BANDFMT_LAST; fmt++ ) {
+		int isize = im__sizeof_bandfmt[fmt];
+		int osize = im__sizeof_bandfmt[format_table[fmt]];
+
+		char source[256];
+		VipsVector *v;
+
+		/* float and double are not handled (well) by ORC.
+		 */
+		if( fmt == IM_BANDFMT_DOUBLE ||	
+			fmt == IM_BANDFMT_FLOAT ||	
+			fmt == IM_BANDFMT_COMPLEX ||
+			fmt == IM_BANDFMT_DPCOMPLEX )
+			continue;
+
+		v = vectors[fmt] = 
+			vips_vector_new_ds( "binary arith", osize, isize );
+		vips_vector_source( v, source, 2, isize );
+
+		vips_vector_temporary( v, "t1", osize );
+		vips_vector_temporary( v, "t2", osize );
+	}
+}
+
+void
+im__compile_programs( VipsVector *vectors[IM_BANDFMT_LAST] )
+{
+	int fmt;
+
+	for( fmt = 0; fmt < IM_BANDFMT_LAST; fmt++ ) {
+		if( vectors[fmt] &&
+			!vips_vector_compile( vectors[fmt] ) )
+			IM_FREEF( vips_vector_free, vectors[fmt] );
+	}
+
+#ifdef DEBUG
+	printf( "im__compile_programs: " );
+	for( fmt = 0; fmt < IM_BANDFMT_LAST; fmt++ ) 
+		if( vectors[fmt] )
+			printf( "%s ", im_BandFmt2char( fmt ) );
+	printf( "\n" );
+#endif /*DEBUG*/
+}
+
+static void
+build_programs( void )
+{
+	static gboolean done = FALSE;
+
+	VipsVector *v;
+
+	if( done )
+		return;
+	done = TRUE;
+
+	im__init_programs( add_vectors, bandfmt_add );
+
+	v = add_vectors[IM_BANDFMT_UCHAR];
+	vips_vector_asm2( v, "convubw", "t1", "s1" );
+	vips_vector_asm2( v, "convubw", "t2", "s2" );
+	vips_vector_asm3( v, "addw", "d1", "t1", "t2" ); 
+
+	v = add_vectors[IM_BANDFMT_CHAR];
+	vips_vector_asm2( v, "convsbw", "t1", "s1" );
+	vips_vector_asm2( v, "convsbw", "t2", "s2" );
+	vips_vector_asm3( v, "addw", "d1", "t1", "t2" ); 
+
+	/*
+
+	   only the 8-bit ones have a useful speedup, with orc-0.4.11 
+	   on a c2d anyway
+
+	   test this again at some point I guess
+
+	v = add_vectors[IM_BANDFMT_USHORT];
+	vips_vector_asm2( v, "convuwl", "t1", "s1" );
+	vips_vector_asm2( v, "convuwl", "t2", "s2" );
+	vips_vector_asm3( v, "addl", "d1", "t1", "t2" );
+
+	v = add_vectors[IM_BANDFMT_SHORT];
+	vips_vector_asm2( v, "convswl", "t1", "s1" );
+	vips_vector_asm2( v, "convswl", "t2", "s2" );
+	vips_vector_asm3( v, "addl", "d1", "t1", "t2" );
+
+	v = add_vectors[IM_BANDFMT_UINT];
+	vips_vector_asm3( v, "addl", "d1", "s1", "s2" );
+
+	v = add_vectors[IM_BANDFMT_INT];
+	vips_vector_asm3( v, "addl", "d1", "s1", "s2" );
+	 */
+
+	im__compile_programs( add_vectors );
+}
+
 /**
  * im_add:
  * @in1: input image 
@@ -387,6 +509,9 @@ static int bandfmt_add[10] = {
  * In other words, the output type is just large enough to hold the whole
  * range of possible values.
  *
+ * Operations on 8-bit images are performed using the processor's vector unit,
+ * if possible. Disable this with --vips-novector or IM_NOVECTOR.
+ *
  * See also: im_subtract(), im_lintra().
  *
  * Returns: 0 on success, -1 on error
@@ -394,6 +519,9 @@ static int bandfmt_add[10] = {
 int 
 im_add( IMAGE *in1, IMAGE *in2, IMAGE *out )
 {
+	if( vips_vector_get_enabled() ) 
+		build_programs();
+
 	return( im__arith_binary( "im_add",
 		in1, in2, out, 
 		bandfmt_add,
diff --git a/libvips/convolution/Makefile.am b/libvips/convolution/Makefile.am
index c5c2b42c..6c40e01d 100644
--- a/libvips/convolution/Makefile.am
+++ b/libvips/convolution/Makefile.am
@@ -6,7 +6,6 @@ libconvolution_la_SOURCES = \
 	im_compass.c \
 	im_conv.c \
 	im_conv_f.c \
-	im_convsep.c \
 	im_convsep_f.c \
 	im_contrast_surface.c \
 	im_fastcor.c \
diff --git a/libvips/convolution/im_conv.c b/libvips/convolution/im_conv.c
index f9c2003d..c8a20d01 100644
--- a/libvips/convolution/im_conv.c
+++ b/libvips/convolution/im_conv.c
@@ -55,6 +55,12 @@
  * 	- add a special case for 3x3 masks, about 20% faster
  * 1/10/10
  * 	- support complex (just double the bands)
+ * 18/10/10
+ * 	- add experimental Orc path
+ * 29/10/10
+ * 	- use VipsVector
+ * 	- get rid of im_convsep(), just call this twice, no longer worth
+ * 	  keeping two versions
  */
 
 /*
@@ -83,6 +89,43 @@
 
  */
 
+/* Show sample pixels as they are transformed.
+#define DEBUG_PIXELS
+ */
+
+/*
+#define DEBUG
+ */
+
+/* 
+
+ 	TODO
+
+	- will this change make much difference to the vips benchmark?
+
+	- would setting params by index rather than name be any quicker?
+
+	- fix up a signed 8-bit code path?
+
+	- try a path with a 32-bit sum for larger matrices / scale / offset, 
+	  much slower?
+
+	- try a 16-bit path, though the speedup might not be worthwhile
+
+	- with a 5x5 matrix:
+
+		5 5 62 0
+		0 1 1 1 0 
+		1 4 6 4 1 
+		1 6 10 6 1 
+		1 4 6 4 1 
+		0 1 1 1 0 
+
+	   Orc is no faster than C, argh, multipass is not worthwhile for 
+	   large matrices 
+
+ */
+
 #ifdef HAVE_CONFIG_H
 #include <config.h>
 #endif /*HAVE_CONFIG_H*/
@@ -93,6 +136,7 @@
 #include <limits.h>
 
 #include <vips/vips.h>
+#include <vips/vector.h>
 
 #ifdef WITH_DMALLOC
 #include <dmalloc.h>
@@ -112,12 +156,26 @@ typedef struct {
 
 	int underflow;		/* Global underflow/overflow counts */
 	int overflow;
+
+	/* The convolver we generate for this mask. We have to split the
+	 * convolve and clip into two phases.
+	 */
+	VipsVector *convolve;
+	VipsVector *clip;
 } Conv;
 
+static void
+conv_vector_free( Conv *conv )
+{
+	IM_FREEF( vips_vector_free, conv->convolve );
+	IM_FREEF( vips_vector_free, conv->clip );
+}
+
 static int
 conv_close( Conv *conv )
 {
 	IM_FREEF( im_free_imask, conv->mask );
+	conv_vector_free( conv );
 
         return( 0 );
 }
@@ -146,6 +204,197 @@ conv_evalend( Conv *conv )
         return( 0 );
 }
 
+#define TEMP( N, S ) vips_vector_temporary( v, N, S )
+#define SRC( N, P, S ) vips_vector_source( v, N, P, S )
+#define CONST( N, V, S ) vips_vector_constant( v, N, V, S )
+#define ASM2( OP, A, B ) vips_vector_asm2( v, OP, A, B )
+#define ASM3( OP, A, B, C ) vips_vector_asm3( v, OP, A, B, C )
+
+/* Generate code for the mask's multiply-add on u8 pixels with a s16 sum, a
+ * second pass does the round and clip.
+ *
+ * 0 for success, -1 if this image / mask can't use the vector path.
+ */
+static int
+conv_compile_convolution_u8s16( Conv *conv )
+{
+	INTMASK *mask = conv->mask;
+
+	double min, max;
+	int i;
+	VipsVector *v;
+	char zero[256];
+	char offset[256];
+	char source[256];
+	char coeff[256];
+
+	if( conv->in->BandFmt != IM_BANDFMT_UCHAR )
+		return( -1 );
+
+	/* Don't test mask size, it's very hard to predict when we will
+	 * exhaust the program space.
+	 */
+
+	/* Can the accumulator overflow or underflow at any stage? Since
+	 * matrix elements are signed, sum the negative and the positive
+	 * terms separately to get the worst-case running min and max.
+	 */
+	min = 0;
+	max = 0;
+	for( i = 0; i < mask->xsize * mask->ysize; i++ ) {
+		double term = 255.0 * mask->coeff[i];
+
+		if( term < 0 )
+			min += term;
+		else
+			max += term;
+
+		if( max > SHRT_MAX )
+			return( -1 );
+		if( min < SHRT_MIN )
+			return( -1 );
+	}
+
+	/* Start with a single source scanline, we add more as we need them.
+	 */
+	conv->convolve = v = vips_vector_new_ds( "conv", 2, 1 );
+
+	/* The value we fetch from the image, the product with the matrix
+	 * value, the accumulated sum.
+	 */
+	TEMP( "value", 1 );
+	TEMP( "product", 2 );
+	TEMP( "sum", 2 );
+
+	CONST( zero, 0, 2 );
+	ASM2( "copyw", "sum", zero );
+
+	for( i = 0; i < mask->xsize * mask->ysize; i++ ) {
+		int x = i % mask->xsize;
+		int y = i / mask->xsize;
+
+		if( !mask->coeff[i] )
+			/* Exclude zero elements.
+			 */
+			continue;
+
+		/* The source. s1 is the first scanline in the mask.
+		 */
+		SRC( source, y + 1, 1 );
+
+		/* The offset, only for non-first-columns though.
+		 */
+		if( x > 0 ) 
+			CONST( offset, conv->in->Bands * x, 1 );
+
+		/* The coefficient. Only for non-1 coeffs though, we skip the
+		 * mul for them.
+		 *
+		 * We need to do 8-bit unsigned pixel * signed mask, so we
+		 * have to cast the pixel up to 16-bit then do a mult against a
+		 * 16-bit constant. We know the result will fit in the bottom
+		 * 16 bits.
+		 */
+		if( mask->coeff[i] != 1 ) 
+			CONST( coeff, mask->coeff[i], 2 );
+
+		/* Two factors: 
+		 * - element is in the first column, ie. has a zero offset
+		 * - mask coeff is 1, ie. we can skip the multiply
+		 *
+		 * We could combine some of these cases, but it's simpler
+		 * and safer to spell them all out.
+		 */
+		if( x == 0 ) 
+			ASM2( "loadb", "value", source );
+		else 
+			ASM3( "loadoffb", "value", source, offset );
+
+		ASM2( "convubw", "product", "value" );
+
+		if( mask->coeff[i] != 1 ) 
+			ASM3( "mullw", "product", "product", coeff );
+
+		ASM3( "addssw", "sum", "sum", "product" );
+
+		/* If we run out of space, fall back to C.
+		 */
+		if( vips_vector_full( v ) )
+			return( -1 );
+	}
+
+	ASM2( "copyw", "d1", "sum" );
+
+	if( !vips_vector_compile( v ) ) 
+		return( -1 );
+
+#ifdef DEBUG
+	vips_vector_print( v );
+#endif /*DEBUG*/
+
+	return( 0 );
+}
+
+/* Generate the program that does (sum + rounding) / scale + offset 
+ * from a s16 intermediate back to a u8 output. 0 for success, -1 on error.
+ */
+static int
+conv_compile_scale_s16u8( Conv *conv )
+{
+	INTMASK *mask = conv->mask;
+
+	VipsVector *v;
+	char scale[256];
+	char offset[256];
+	char zero[256];
+	int bias;
+
+	/* Scale is an 8-bit divisor and must not be zero.
+	 */
+	if( mask->scale > 255 || mask->scale < 1 ||
+		mask->offset > SHRT_MAX || mask->offset < SHRT_MIN ) 
+		return( -1 );
+
+	/* We can only do unsigned divide, so we must add the offset before
+	 * dividing by the scale. We need to scale the offset up, and we can
+	 * build the rounding in as well (scale / 2, since (scale + 1) / 2
+	 * would add one for scale == 1). This must still fit in 16 bits.
+	 */
+	bias = mask->offset * mask->scale + mask->scale / 2;
+	if( bias > SHRT_MAX || bias < SHRT_MIN )
+		return( -1 );
+
+	conv->clip = v = vips_vector_new_ds( "clip", 1, 2 );
+	TEMP( "t1", 2 );
+	TEMP( "t2", 2 );
+
+	CONST( scale, mask->scale, 1 );
+	CONST( offset, bias, 2 );
+	CONST( zero, 0, 2 );
+
+	/* Offset and scale. 
+	 */
+	ASM3( "addssw", "t1", "s1", offset );
+
+	/* We need to convert the signed result of the
+	 * offset to unsigned for the div, ie. we want to set anything <0 to 0.
+	 */
+	ASM3( "cmpgtsw", "t2", "t1", zero );
+	ASM3( "andw", "t1", "t1", "t2" );
+
+	ASM3( "divluw", "t1", "t1", scale );
+	ASM2( "convuuswb", "d1", "t1" );
+
+	if( !vips_vector_compile( v ) ) 
+		return( -1 );
+
+#ifdef DEBUG
+	vips_vector_print( v );
+#endif /*DEBUG*/
+
+	return( 0 );
+}
+
 static Conv *
 conv_new( IMAGE *in, IMAGE *out, INTMASK *mask )
 {
@@ -165,6 +414,9 @@ conv_new( IMAGE *in, IMAGE *out, INTMASK *mask )
         conv->underflow = 0;
         conv->overflow = 0;
 
+	conv->convolve = NULL;
+	conv->clip = NULL;
+
         if( im_add_close_callback( out, 
 		(im_callback_fn) conv_close, conv, NULL ) ||
 		im_add_close_callback( out, 
@@ -194,6 +446,14 @@ conv_new( IMAGE *in, IMAGE *out, INTMASK *mask )
 		conv->nnz = 1;
 	}
 
+	/* Generate code for this mask / image, if possible.
+	 */
+	if( vips_vector_get_enabled() ) {
+		if( conv_compile_convolution_u8s16( conv ) ||
+			conv_compile_scale_s16u8( conv ) ) 
+			conv_vector_free( conv );
+	}
+
         return( conv );
 }
 
@@ -210,6 +470,11 @@ typedef struct {
 	int overflow;
 
 	int last_bpl;		/* Avoid recalcing offsets, if we can */
+
+	/* We need an intermediate buffer to keep the result of the conv in
+	 * before we clip it.
+	 */
+	void *sum;
 } ConvSequence;
 
 /* Free a sequence value.
@@ -227,6 +492,8 @@ conv_stop( void *vseq, void *a, void *b )
 
 	IM_FREEF( im_region_free, seq->ir );
 
+	IM_FREE( seq->sum );
+
 	return( 0 );
 }
 
@@ -250,13 +517,15 @@ conv_start( IMAGE *out, void *a, void *b )
 	seq->underflow = 0;
 	seq->overflow = 0;
 	seq->last_bpl = -1;
+	seq->sum = NULL;
 
 	/* Attach region and arrays.
 	 */
 	seq->ir = im_region_create( in );
 	seq->offsets = IM_ARRAY( out, conv->nnz, int );
 	seq->pts = IM_ARRAY( out, conv->nnz, PEL * );
-	if( !seq->ir || !seq->offsets || !seq->pts ) {
+	seq->sum = IM_ARRAY( NULL, IM_IMAGE_N_ELEMENTS( in ), short );
+	if( !seq->ir || !seq->offsets || !seq->pts || !seq->sum ) {
 		conv_stop( seq, in, conv );
 		return( NULL );
 	}
@@ -333,8 +602,7 @@ conv_gen( REGION *or, void *vseq, void *a, void *b )
 	int le = r->left;
 	int to = r->top;
 	int bo = IM_RECT_BOTTOM( r );
-	int sz = IM_REGION_N_ELEMENTS( or ) * 
-		(vips_bandfmt_iscomplex( in->BandFmt ) ? 2 : 1);
+	int sz = IM_REGION_N_ELEMENTS( or ) * (im_iscomplex( in ) ? 2 : 1);
 
 	int x, y, z, i;
 
@@ -428,13 +696,13 @@ conv_gen( REGION *or, void *vseq, void *a, void *b )
 		sum = 0; \
 		sum += m[0] * p0[0]; \
 		sum += m[1] * p0[bands]; \
-		sum += m[2] * p0[bands << 1]; \
+		sum += m[2] * p0[bands * 2]; \
 		sum += m[3] * p1[0]; \
 		sum += m[4] * p1[bands]; \
-		sum += m[5] * p1[bands << 1]; \
+		sum += m[5] * p1[bands * 2]; \
 		sum += m[6] * p2[0]; \
 		sum += m[7] * p2[bands]; \
-		sum += m[8] * p2[bands << 1]; \
+		sum += m[8] * p2[bands * 2]; \
 		\
 		p0 += 1; \
 		p1 += 1; \
@@ -462,13 +730,13 @@ conv_gen( REGION *or, void *vseq, void *a, void *b )
 		sum = 0; \
 		sum += m[0] * p0[0]; \
 		sum += m[1] * p0[bands]; \
-		sum += m[2] * p0[bands << 1]; \
+		sum += m[2] * p0[bands * 2]; \
 		sum += m[3] * p1[0]; \
 		sum += m[4] * p1[bands]; \
-		sum += m[5] * p1[bands << 1]; \
+		sum += m[5] * p1[bands * 2]; \
 		sum += m[6] * p2[0]; \
 		sum += m[7] * p2[bands]; \
-		sum += m[8] * p2[bands << 1]; \
+		sum += m[8] * p2[bands * 2]; \
  		\
 		p0 += 1; \
 		p1 += 1; \
@@ -502,8 +770,7 @@ conv3x3_gen( REGION *or, void *vseq, void *a, void *b )
 	int le = r->left;
 	int to = r->top;
 	int bo = IM_RECT_BOTTOM( r );
-	int sz = IM_REGION_N_ELEMENTS( or ) * 
-		(vips_bandfmt_iscomplex( in->BandFmt ) ? 2 : 1);
+	int sz = IM_REGION_N_ELEMENTS( or ) * (im_iscomplex( in ) ? 2 : 1);
 	int bands = in->Bands;
 
 	Rect s;
@@ -568,6 +835,79 @@ conv3x3_gen( REGION *or, void *vseq, void *a, void *b )
 	return( 0 );
 }
 
+/* The VipsVector codepath.
+ */
+static int
+convvec_gen( REGION *or, void *vseq, void *a, void *b )
+{
+	ConvSequence *seq = (ConvSequence *) vseq;
+	IMAGE *in = (IMAGE *) a;
+	Conv *conv = (Conv *) b;
+	INTMASK *mask = conv->mask;
+	REGION *ir = seq->ir;
+
+	Rect *r = &or->valid;
+	int sz = IM_REGION_N_ELEMENTS( or ) * (im_iscomplex( in ) ? 2 : 1);
+
+	Rect s;
+	int y, j;
+	VipsExecutor convolve;
+	VipsExecutor clip;
+
+	/* Prepare the section of the input image we need. A little larger
+	 * than the section of the output image we are producing.
+	 */
+	s = *r;
+	s.width += mask->xsize - 1;
+	s.height += mask->ysize - 1;
+	if( im_prepare( ir, &s ) )
+		return( -1 );
+
+	vips_executor_set_program( &convolve, conv->convolve, sz );
+	vips_executor_set_program( &clip, conv->clip, sz );
+
+	/* Link the combiner to the intermediate buffer.
+	 */
+	vips_executor_set_array( &convolve, "d1", seq->sum );
+	vips_executor_set_array( &clip, "s1", seq->sum );
+
+	for( y = 0; y < r->height; y++ ) { 
+#ifdef DEBUG_PIXELS
+{
+		int h, v;
+
+		printf( "before convolve: %d, %d\n", r->left, r->top + y );
+		for( v = 0; v < mask->ysize; v++ ) {
+			for( h = 0; h < mask->xsize; h++ )
+				printf( "%3d ", *((PEL *) IM_REGION_ADDR( ir, 
+					r->left + h, r->top + y + v )) );
+			printf( "\n" );
+		}
+}
+#endif /*DEBUG_PIXELS*/
+
+		for( j = 0; j < mask->ysize; j++ )
+			vips_executor_set_source( &convolve, j + 1, 
+				IM_REGION_ADDR( ir, r->left, r->top + y + j ) );
+		vips_executor_run( &convolve );
+
+#ifdef DEBUG_PIXELS
+		printf( "before clip: %3d\n", *((signed short *) seq->sum) );
+#endif /*DEBUG_PIXELS*/
+
+		vips_executor_set_array( &clip, "d1", 
+			IM_REGION_ADDR( or, r->left, r->top + y ) );
+		vips_executor_run( &clip );
+
+#ifdef DEBUG_PIXELS
+		printf( "after clip: %d\n", 
+			*((PEL *) IM_REGION_ADDR( or, r->left, r->top + y )) );
+#endif /*DEBUG_PIXELS*/
+	}
+
+	return( 0 );
+}
+
 int
 im_conv_raw( IMAGE *in, IMAGE *out, INTMASK *mask )
 {
@@ -599,7 +939,14 @@ im_conv_raw( IMAGE *in, IMAGE *out, INTMASK *mask )
 		return( -1 );
 	}
 
-	if( mask->xsize == 3 && mask->ysize == 3 )
+	if( conv->convolve ) {
+		generate = convvec_gen;
+
+#ifdef DEBUG
+		printf( "im_conv_raw: using vector path\n" );
+#endif /*DEBUG*/
+	}
+	else if( mask->xsize == 3 && mask->ysize == 3 )
 		generate = conv3x3_gen;
 	else
 		generate = conv_gen;
@@ -631,6 +978,10 @@ im_conv_raw( IMAGE *in, IMAGE *out, INTMASK *mask )
  * and offset are part of @mask. For integer @in, the division by scale
  * includes round-to-nearest.
  *
+ * Small convolutions on unsigned 8-bit images are performed using the 
+ * processor's vector unit,
+ * if possible. Disable this with --vips-novector or IM_NOVECTOR.
+ *
  * See also: im_conv_f(), im_convsep(), im_create_imaskv().
  *
  * Returns: 0 on success, -1 on error
@@ -652,3 +1003,75 @@ im_conv( IMAGE *in, IMAGE *out, INTMASK *mask )
 
 	return( 0 );
 }
+
+int
+im_convsep_raw( IMAGE *in, IMAGE *out, INTMASK *mask )
+{
+	IMAGE *t;
+	INTMASK *rmask;
+
+	if( mask->xsize != 1 && mask->ysize != 1 ) {
+                im_error( "im_convsep", 
+			"%s", _( "expect 1xN or Nx1 input mask" ) );
+                return( -1 );
+	}
+
+	if( !(t = im_open_local( out, "im_convsep", "p" )) ||
+		!(rmask = (INTMASK *) im_local( out, 
+		(im_construct_fn) im_dup_imask,
+		(im_callback_fn) im_free_imask, mask, mask->filename, NULL )) )
+		return( -1 );
+
+	rmask->xsize = mask->ysize;
+	rmask->ysize = mask->xsize;
+
+	if( im_conv_raw( in, t, mask ) ||
+		im_conv_raw( t, out, rmask ) )
+		return( -1 );
+
+	return( 0 );
+}
+
+/**
+ * im_convsep:
+ * @in: input image
+ * @out: output image
+ * @mask: convolution mask
+ *
+ * Perform a separable convolution of @in with @mask using integer arithmetic. 
+ *
+ * The mask must be 1xn or nx1 elements. 
+ * The output image 
+ * always has the same #VipsBandFmt as the input image. 
+ *
+ * The image is convolved twice: once with @mask and then again with @mask 
+ * rotated by 90 degrees. This is much faster for certain types of mask
+ * (gaussian blur, for example) than doing a full 2D convolution.
+ *
+ * Each output pixel is
+ * calculated as sigma[i]{pixel[i] * mask[i]} / scale + offset, where scale
+ * and offset are part of @mask. For integer @in, the division by scale
+ * includes round-to-nearest.
+ *
+ * See also: im_convsep_f(), im_conv(), im_create_imaskv().
+ *
+ * Returns: 0 on success, -1 on error
+ */
+int 
+im_convsep( IMAGE *in, IMAGE *out, INTMASK *mask )
+{
+	IMAGE *t1 = im_open_local( out, "im_convsep intermediate", "p" );
+	int size = mask->xsize * mask->ysize;
+
+	if( !t1 || 
+		im_embed( in, t1, 1, size / 2, size / 2, 
+			in->Xsize + size - 1, 
+			in->Ysize + size - 1 ) ||
+		im_convsep_raw( t1, out, mask ) )
+		return( -1 );
+
+	out->Xoffset = 0;
+	out->Yoffset = 0;
+
+	return( 0 );
+}
diff --git a/libvips/convolution/im_convsep.c b/libvips/convolution/im_convsep.c
deleted file mode 100644
index 30dedccc..00000000
--- a/libvips/convolution/im_convsep.c
+++ /dev/null
@@ -1,461 +0,0 @@
-/* im_convsep
- *
- * Copyright: 1990, N. Dessipris.
- *
- * Author: Nicos Dessipris
- * Written on: 29/04/1991
- * Modified on: 29/4/93 K.Martinez  for Sys5
- * 9/3/01 JC
- *	- rewritten using im_conv()
- * 27/7/01 JC
- *	- rejects masks with scale == 0
- * 7/4/04 
- *	- now uses im_embed() with edge stretching on the input, not
- *	  the output
- *	- sets Xoffset / Yoffset
- * 21/4/04
- *	- scale down int convolves at 1/2 way mark, much less likely to integer
- *	  overflow on intermediates
- * 12/5/08
- * 	- int rounding was +1 too much, argh
- * 3/2/10
- * 	- gtkdoc
- * 	- more cleanups
- * 1/10/10
- * 	- support complex (just double the bands)
- */
-
-/*
-
-    This file is part of VIPS.
-    
-    VIPS is free software; you can redistribute it and/or modify
-    it under the terms of the GNU Lesser General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-
- */
-
-/*
-
-    These files are distributed with VIPS - http://www.vips.ecs.soton.ac.uk
-
- */
-
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif /*HAVE_CONFIG_H*/
-#include <vips/intl.h>
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <limits.h>
-
-#include <vips/vips.h>
-
-#ifdef WITH_DMALLOC
-#include <dmalloc.h>
-#endif /*WITH_DMALLOC*/
-
-/* Our parameters ... we take a copy of the mask argument.
- */
-typedef struct {
-	IMAGE *in;
-	IMAGE *out;
-	INTMASK *mask;	/* Copy of mask arg */
-
-	int size;	/* N for our 1xN or Nx1 mask */
-	int scale;	/* Our scale ... we have to square mask->scale */
-
-	int underflow;	/* Global underflow/overflow counts */
-	int overflow;
-} Conv;
-
-/* End of evaluation --- print overflows and underflows.
- */
-static int
-conv_destroy( Conv *conv )
-{
-	/* Print underflow/overflow count.
-	 */
-	if( conv->overflow || conv->underflow )
-		im_warn( "im_convsep", _( "%d overflows and %d underflows "
-			"detected" ), conv->overflow, conv->underflow );
-
-	if( conv->mask ) {
-		(void) im_free_imask( conv->mask );
-		conv->mask = NULL;
-	}
-
-        return( 0 );
-}
-
-static Conv *
-conv_new( IMAGE *in, IMAGE *out, INTMASK *mask )
-{
-        Conv *conv = IM_NEW( out, Conv );
-
-        if( !conv )
-                return( NULL );
-
-        conv->in = in;
-        conv->out = out;
-        conv->mask = NULL;
-	conv->size = mask->xsize * mask->ysize;
-	conv->scale = mask->scale * mask->scale;
-        conv->underflow = 0;
-        conv->overflow = 0;
-
-        if( im_add_close_callback( out, 
-		(im_callback_fn) conv_destroy, conv, NULL ) ||
-		!(conv->mask = im_dup_imask( mask, "conv_mask" )) )
-                return( NULL );
-
-        return( conv );
-}
-
-/* Our sequence value.
- */
-typedef struct {
-	Conv *conv;
-	REGION *ir;		/* Input region */
-
-	PEL *sum;		/* Line buffer */
-
-	int underflow;		/* Underflow/overflow counts */
-	int overflow;
-} ConvSequence;
-
-/* Free a sequence value.
- */
-static int
-conv_stop( void *vseq, void *a, void *b )
-{
-	ConvSequence *seq = (ConvSequence *) vseq;
-	Conv *conv = (Conv *) b;
-
-	/* Add local under/over counts to global counts.
-	 */
-	conv->overflow += seq->overflow;
-	conv->underflow += seq->underflow;
-
-	IM_FREEF( im_region_free, seq->ir );
-
-	return( 0 );
-}
-
-/* Convolution start function.
- */
-static void *
-conv_start( IMAGE *out, void *a, void *b )
-{
-	IMAGE *in = (IMAGE *) a;
-	Conv *conv = (Conv *) b;
-	ConvSequence *seq;
-
-	if( !(seq = IM_NEW( out, ConvSequence )) )
-		return( NULL );
-
-	/* Init!
-	 */
-	seq->conv = conv;
-	seq->ir = NULL;
-	seq->sum = NULL;
-	seq->underflow = 0;
-	seq->overflow = 0;
-
-	/* Attach region and arrays.
-	 */
-	seq->ir = im_region_create( in );
-	if( vips_bandfmt_isint( conv->out->BandFmt ) )
-		seq->sum = (PEL *) 
-			IM_ARRAY( out, IM_IMAGE_N_ELEMENTS( in ), int );
-	else
-		seq->sum = (PEL *) 
-			IM_ARRAY( out, IM_IMAGE_N_ELEMENTS( in ), double );
-	if( !seq->ir || !seq->sum ) {
-		conv_stop( seq, in, conv );
-		return( NULL );
-	}
-
-	return( (void *) seq );
-}
-
-/* What we do for every point in the mask, for each pixel.
- */
-#define VERTICAL_CONV { z -= 1; li -= lskip; sum += coeff[z] * vfrom[li]; }
-#define HORIZONTAL_CONV { z -= 1; li -= bands; sum += coeff[z] * hfrom[li]; }
-
-/* INT and FLOAT inner loops.
- */
-#define CONV_INT( TYPE, IM_CLIP ) { \
-	TYPE *vfrom; \
-	int *vto; \
-	int *hfrom; \
-	TYPE *hto; \
- 	\
-	/* Convolve to sum array. We convolve the full width of \
-	 * this input line. \
-	 */ \
-	vfrom = (TYPE *) IM_REGION_ADDR( ir, le, y ); \
-	vto = (int *) seq->sum; \
-	for( x = 0; x < isz; x++ ) {   \
-		int sum; \
-		 \
-		z = conv->size;  \
-		li = lskip * z; \
-		sum = 0; \
- 		\
-		IM_UNROLL( z, VERTICAL_CONV ); \
- 		\
-		sum = ((sum + rounding) / mask->scale) + mask->offset; \
-		\
-		vto[x] = sum;   \
-		vfrom += 1; \
-	}  \
- 	\
-	/* Convolve sums to output. \
-	 */ \
-	hfrom = (int *) seq->sum; \
-	hto = (TYPE *) IM_REGION_ADDR( or, le, y );  \
-	for( x = 0; x < osz; x++ ) { \
-		int sum; \
-		 \
-		z = conv->size;  \
-		li = bands * z; \
-		sum = 0; \
- 		\
-		IM_UNROLL( z, HORIZONTAL_CONV ); \
- 		\
-		sum = ((sum + rounding) / mask->scale) + mask->offset; \
- 		\
-		IM_CLIP; \
- 		\
-		hto[x] = sum;   \
-		hfrom += 1; \
-	} \
-}
-
-#define CONV_FLOAT( TYPE ) { \
-	TYPE *vfrom; \
-	double *vto; \
-	double *hfrom; \
-	TYPE *hto; \
- 	\
-	/* Convolve to sum array. We convolve the full width of \
-	 * this input line. \
-	 */ \
-	vfrom = (TYPE *) IM_REGION_ADDR( ir, le, y ); \
-	vto = (double *) seq->sum; \
-	for( x = 0; x < isz; x++ ) {   \
-		double sum; \
-		 \
-		z = conv->size;  \
-		li = lskip * z; \
-		sum = 0; \
- 		\
-		IM_UNROLL( z, VERTICAL_CONV ); \
- 		\
-		vto[x] = sum;   \
-		vfrom += 1; \
-	}  \
- 	\
-	/* Convolve sums to output. \
-	 */ \
-	hfrom = (double *) seq->sum; \
-	hto = (TYPE *) IM_REGION_ADDR( or, le, y );  \
-	for( x = 0; x < osz; x++ ) { \
-		double sum; \
-		 \
-		z = conv->size;  \
-		li = bands * z; \
-		sum = 0; \
- 		\
-		IM_UNROLL( z, HORIZONTAL_CONV ); \
- 		\
-		sum = (sum / conv->scale) + mask->offset; \
- 		\
-		hto[x] = sum;   \
-		hfrom += 1; \
-	} \
-}
-
-/* Convolve!
- */
-static int
-conv_gen( REGION *or, void *vseq, void *a, void *b )
-{
-	ConvSequence *seq = (ConvSequence *) vseq;
-	IMAGE *in = (IMAGE *) a;
-	Conv *conv = (Conv *) b;
-	REGION *ir = seq->ir;
-	INTMASK *mask = conv->mask;
-
-	/* You might think this should be (scale+1)/2, but then we'd be adding
-	 * one for scale == 1.
-	 */
-	int rounding = mask->scale / 2;
-
-	int bands = in->Bands;
-	int *coeff = conv->mask->coeff; 
-
-	Rect *r = &or->valid;
-	int le = r->left;
-	int to = r->top;
-	int bo = IM_RECT_BOTTOM(r);
-	int osz = IM_REGION_N_ELEMENTS( or ) * 
-		(vips_bandfmt_iscomplex( in->BandFmt ) ? 2 : 1);
-
-	Rect s;
-	int lskip;
-	int isz;
-	int x, y, z, li;
-
-	/* Prepare the section of the input image we need. A little larger
-	 * than the section of the output image we are producing.
-	 */
-	s = *r;
-	s.width += conv->size - 1;
-	s.height += conv->size - 1;
-	if( im_prepare( ir, &s ) )
-		return( -1 );
-	lskip = IM_REGION_LSKIP( ir ) / IM_IMAGE_SIZEOF_ELEMENT( in );
-	isz = IM_REGION_N_ELEMENTS( ir );
-
-	for( y = to; y < bo; y++ ) { 
-		switch( in->BandFmt ) {
-		case IM_BANDFMT_UCHAR: 	
-			CONV_INT( unsigned char, IM_CLIP_UCHAR( sum, seq ) ); 
-			break;
-		case IM_BANDFMT_CHAR:   
-			CONV_INT( signed char, IM_CLIP_CHAR( sum, seq ) ); 
-			break;
-		case IM_BANDFMT_USHORT: 
-			CONV_INT( unsigned short, IM_CLIP_USHORT( sum, seq ) ); 
-			break;
-		case IM_BANDFMT_SHORT:  
-			CONV_INT( signed short, IM_CLIP_SHORT( sum, seq ) ); 
-			break;
-		case IM_BANDFMT_UINT:   
-			CONV_INT( unsigned int, IM_CLIP_NONE( sum, seq ) ); 
-			break;
-		case IM_BANDFMT_INT:    
-			CONV_INT( signed int, IM_CLIP_NONE( sum, seq ) ); 
-			break;
-		case IM_BANDFMT_FLOAT:  
-		case IM_BANDFMT_COMPLEX:  
-			CONV_FLOAT( float ); 
-			break;
-		case IM_BANDFMT_DOUBLE: 
-		case IM_BANDFMT_DPCOMPLEX:  
-			CONV_FLOAT( double ); 
-			break;
-
-		default:
-			g_assert( 0 );
-		}
-	}
-
-	return( 0 );
-}
-
-int
-im_convsep_raw( IMAGE *in, IMAGE *out, INTMASK *mask )
-{
-	Conv *conv;
-
-	/* Check parameters.
-	 */
-	if( im_piocheck( in, out ) ||
-		im_check_uncoded( "im_convsep", in ) ||
-		im_check_imask( "im_convsep", mask ) ) 
-		return( -1 );
-	if( mask->xsize != 1 && mask->ysize != 1 ) {
-                im_error( "im_convsep", 
-			"%s", _( "expect 1xN or Nx1 input mask" ) );
-                return( -1 );
-	}
-	if( mask->scale == 0 ) {
-		im_error( "im_convsep", "%s", "mask scale must be non-zero" );
-		return( -1 );
-	}
-	if( !(conv = conv_new( in, out, mask )) )
-		return( -1 );
-
-	/* Prepare output. Consider a 7x7 mask and a 7x7 image --- the output
-	 * would be 1x1.
-	 */
-	if( im_cp_desc( out, in ) )
-		return( -1 );
-	out->Xsize -= conv->size - 1;
-	out->Ysize -= conv->size - 1;
-	if( out->Xsize <= 0 || out->Ysize <= 0 ) {
-		im_error( "im_convsep", "%s", _( "image too small for mask" ) );
-		return( -1 );
-	}
-
-	/* SMALLTILE seems the fastest in benchmarks.
-	 */
-	if( im_demand_hint( out, IM_SMALLTILE, in, NULL ) ||
-		im_generate( out, conv_start, conv_gen, conv_stop, in, conv ) )
-		return( -1 );
-
-	out->Xoffset = -mask->xsize / 2;
-	out->Yoffset = -mask->ysize / 2;
-
-	return( 0 );
-}
-
-
-/**
- * im_convsep:
- * @in: input image
- * @out: output image
- * @mask: convolution mask
- *
- * Perform a separable convolution of @in with @mask using integer arithmetic. 
- *
- * The mask must be 1xn or nx1 elements. 
- * The output image 
- * always has the same #VipsBandFmt as the input image. 
- *
- * The image is convolved twice: once with @mask and then again with @mask 
- * rotated by 90 degrees. This is much faster for certain types of mask
- * (gaussian blur, for example) than doing a full 2D convolution.
- *
- * Each output pixel is
- * calculated as sigma[i]{pixel[i] * mask[i]} / scale + offset, where scale
- * and offset are part of @mask. For integer @in, the division by scale
- * includes round-to-nearest.
- *
- * See also: im_convsep_f(), im_conv(), im_create_imaskv().
- *
- * Returns: 0 on success, -1 on error
- */
-int 
-im_convsep( IMAGE *in, IMAGE *out, INTMASK *mask )
-{
-	IMAGE *t1 = im_open_local( out, "im_convsep intermediate", "p" );
-	int size = mask->xsize * mask->ysize;
-
-	if( !t1 || 
-		im_embed( in, t1, 1, size / 2, size / 2, 
-			in->Xsize + size - 1, 
-			in->Ysize + size - 1 ) ||
-		im_convsep_raw( t1, out, mask ) )
-		return( -1 );
-
-	out->Xoffset = 0;
-	out->Yoffset = 0;
-
-	return( 0 );
-}
diff --git a/libvips/include/vips/Makefile.am b/libvips/include/vips/Makefile.am
index b4fa2c8b..94e1b056 100644
--- a/libvips/include/vips/Makefile.am
+++ b/libvips/include/vips/Makefile.am
@@ -43,6 +43,7 @@ pkginclude_HEADERS = \
 	transform.h \
 	util.h \
 	version.h \
+	vector.h \
 	vips.h 
 
 vipsc++.h:
diff --git a/libvips/include/vips/image.h b/libvips/include/vips/image.h
index 574de6f8..cbb8c067 100644
--- a/libvips/include/vips/image.h
+++ b/libvips/include/vips/image.h
@@ -87,7 +87,8 @@ typedef enum {
 	IM_BANDFMT_FLOAT = 6,
 	IM_BANDFMT_COMPLEX = 7,
 	IM_BANDFMT_DOUBLE = 8,
-	IM_BANDFMT_DPCOMPLEX = 9
+	IM_BANDFMT_DPCOMPLEX = 9,
+	IM_BANDFMT_LAST = 10
 } VipsBandFmt;
 
 typedef enum {
diff --git a/libvips/include/vips/util.h b/libvips/include/vips/util.h
index 27080247..efdf0e7f 100644
--- a/libvips/include/vips/util.h
+++ b/libvips/include/vips/util.h
@@ -56,6 +56,13 @@ extern "C" {
 #define IM_CLIP(A,V,B) IM_MAX( (A), IM_MIN( (B), (V) ) )
 #define IM_NUMBER(R) ((int)(sizeof(R)/sizeof(R[0])))
 
+#define IM_SWAP( TYPE, A, B ) \
+G_STMT_START { /* A and B are each evaluated twice: no side effects */ \
+	TYPE im__swap_tmp = (A); /* odd name: A or B may be called t */ \
+	(A) = (B); \
+	(B) = im__swap_tmp; \
+} G_STMT_END
+
 #define IM_FREEF( F, S ) \
 G_STMT_START \
         if( S ) { \
@@ -90,7 +97,8 @@ G_STMT_START { \
 
 /* Duff's device. Do OPERation N times in a 16-way unrolled loop.
  */
-#define IM_UNROLL( N, OPER ) { \
+#define IM_UNROLL( N, OPER ) \
+G_STMT_START \
 	if( (N) ) { \
 		int duff_count = ((N) + 15) / 16; \
 		\
@@ -114,7 +122,7 @@ G_STMT_START { \
 			 } while( --duff_count > 0 ); \
 		} \
 	} \
-}
+G_STMT_END
 
 /* Round a float to the nearest integer. Much faster than rint(). 
  */
@@ -122,7 +130,8 @@ G_STMT_START { \
 
 /* Various integer range clips. Record over/under flows.
  */
-#define IM_CLIP_UCHAR( V, SEQ ) { \
+#define IM_CLIP_UCHAR( V, SEQ ) \
+G_STMT_START \
 	if( (V) < 0 ) {   \
 		(SEQ)->underflow++;   \
 		(V) = 0;   \
@@ -131,9 +140,10 @@ G_STMT_START { \
 		(SEQ)->overflow++;   \
 		(V) = UCHAR_MAX;   \
 	}  \
-}
+G_STMT_END
 
-#define IM_CLIP_USHORT( V, SEQ ) { \
+#define IM_CLIP_USHORT( V, SEQ ) \
+G_STMT_START \
 	if( (V) < 0 ) {   \
 		(SEQ)->underflow++;   \
 		(V) = 0;   \
@@ -142,9 +152,10 @@ G_STMT_START { \
 		(SEQ)->overflow++;   \
 		(V) = USHRT_MAX;   \
 	}  \
-}
+G_STMT_END
 
-#define IM_CLIP_CHAR( V, SEQ ) { \
+#define IM_CLIP_CHAR( V, SEQ ) \
+G_STMT_START \
 	if( (V) < SCHAR_MIN ) {   \
 		(SEQ)->underflow++;   \
 		(V) = SCHAR_MIN;   \
@@ -153,9 +164,10 @@ G_STMT_START { \
 		(SEQ)->overflow++;   \
 		(V) = SCHAR_MAX;   \
 	}  \
-}
+G_STMT_END
 
-#define IM_CLIP_SHORT( V, SEQ ) { \
+#define IM_CLIP_SHORT( V, SEQ ) \
+G_STMT_START \
 	if( (V) < SHRT_MIN ) {   \
 		(SEQ)->underflow++;   \
 		(V) = SHRT_MIN;   \
@@ -164,7 +176,7 @@ G_STMT_START { \
 		(SEQ)->overflow++;   \
 		(V) = SHRT_MAX;   \
 	}  \
-}
+G_STMT_END
 
 #define IM_CLIP_NONE( V, SEQ ) {}
 
diff --git a/libvips/include/vips/vector.h b/libvips/include/vips/vector.h
new file mode 100644
index 00000000..a07691e7
--- /dev/null
+++ b/libvips/include/vips/vector.h
@@ -0,0 +1,115 @@
+/* helper stuff for Orc
+ *
+ * 29/10/10
+ *	- from im_dilate hackery
+ */
+
+/*
+
+    This file is part of VIPS.
+    
+    VIPS is free software; you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+ */
+
+/*
+
+    These files are distributed with VIPS - http://www.vips.ecs.soton.ac.uk
+
+ */
+
+#ifndef IM_VECTOR_H
+#define IM_VECTOR_H
+
+#ifdef HAVE_ORC
+#include <orc/orc.h>
+#endif /*HAVE_ORC*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif /*__cplusplus*/
+
+/* An Orc program. 
+ */
+typedef struct {
+	/* Handy for debugging.
+	 */
+	const char *name;
+
+	/* How many resources we've used so far in this codegen. 
+	 */
+	int n_temp;
+	int n_source;
+	int n_destination;
+	int n_constant;
+	int n_parameter;
+	int n_instruction;
+
+#ifdef HAVE_ORC
+        /* The code we have generated.
+	 */
+        OrcProgram *program;
+#endif /*HAVE_ORC*/
+
+	/* Compiled successfully.
+	 */
+	gboolean compiled;
+} VipsVector;
+
+#ifdef HAVE_ORC
+typedef OrcExecutor VipsExecutor;
+#else /*!HAVE_ORC*/
+typedef int VipsExecutor;
+#endif /*HAVE_ORC*/
+
+/* Set from the command-line.
+ */
+extern gboolean im__vector_enabled;
+
+void vips_vector_init( void );
+gboolean vips_vector_get_enabled( void );
+void vips_vector_set_enabled( gboolean enabled );
+
+void vips_vector_free( VipsVector *vector );
+VipsVector *vips_vector_new_ds( const char *name, int size1, int size2 );
+
+void vips_vector_constant( VipsVector *vector, 
+	char *name, int value, int size );
+void vips_vector_source_name( VipsVector *vector, char *name, int size );
+void vips_vector_source( VipsVector *vector, char *name, int number, int size );
+void vips_vector_temporary( VipsVector *vector, char *name, int size );
+void vips_vector_asm2( VipsVector *vector, 
+	const char *op, const char *a, const char *b );
+void vips_vector_asm3( VipsVector *vector, 
+	const char *op, const char *a, const char *b, const char *c );
+gboolean vips_vector_full( VipsVector *vector );
+
+gboolean vips_vector_compile( VipsVector *vector );
+
+void vips_vector_print( VipsVector *vector );
+
+void vips_executor_set_program( VipsExecutor *executor, 
+	VipsVector *vector, int n );
+void vips_executor_set_source( VipsExecutor *executor, int n, void *value );
+void vips_executor_set_destination( VipsExecutor *executor, void *value );
+void vips_executor_set_array( VipsExecutor *executor, char *name, void *value );
+
+void vips_executor_run( VipsExecutor *executor );
+
+#ifdef __cplusplus
+}
+#endif /*__cplusplus*/
+
+#endif /*IM_VECTOR_H*/
diff --git a/libvips/inplace/flood.c b/libvips/inplace/flood.c
index d62967bf..b00ecd74 100644
--- a/libvips/inplace/flood.c
+++ b/libvips/inplace/flood.c
@@ -73,12 +73,6 @@
 #include <dmalloc.h>
 #endif /*WITH_DMALLOC*/
 
-#define SWAP( TYPE, A, B ) { \
-	TYPE t = (A); \
-	(A) = (B); \
-	(B) = t; \
-}
-
 /* Size of a scanline buffer. We allocate a list of these to hold scanlines 
  * we need to visit.
  */
@@ -346,7 +340,7 @@ flood_all( Flood *flood, int x, int y )
 			p->n = 0;
 		}
 
-		SWAP( Buffer *, flood->in, flood->out );
+		IM_SWAP( Buffer *, flood->in, flood->out );
 	}
 }
 
diff --git a/libvips/inplace/im_draw_line.c b/libvips/inplace/im_draw_line.c
index 9ee5548b..8572ffa3 100644
--- a/libvips/inplace/im_draw_line.c
+++ b/libvips/inplace/im_draw_line.c
@@ -64,8 +64,6 @@
 #include <dmalloc.h>
 #endif /*WITH_DMALLOC*/
 
-#define SWAP(A,B) {int t; t = (A); (A) = (B); (B) = t;}
-
 typedef struct _Line {
 	Draw draw;
 
@@ -112,14 +110,14 @@ line_new( VipsImage *im, int x1, int y1, int x2, int y2, PEL *ink )
 		 * right. Do diagonals here .. just have up and right and down
 		 * and right now.
 		 */
-		SWAP( x1, x2 );
-		SWAP( y1, y2 );
+		IM_SWAP( int, x1, x2 );
+		IM_SWAP( int, y1, y2 );
 	}
 	else if( abs( line->dx ) < abs( line->dy ) && line->dy < 0 ) {
 		/* Swap to get all y greater cases going down the screen.
 		 */
-		SWAP( x1, x2 );
-		SWAP( y1, y2 );
+		IM_SWAP( int, x1, x2 );
+		IM_SWAP( int, y1, y2 );
 	}
 
 	/* Recalculate dx, dy.
diff --git a/libvips/iofuncs/Makefile.am b/libvips/iofuncs/Makefile.am
index 6c30108e..e1e1b456 100644
--- a/libvips/iofuncs/Makefile.am
+++ b/libvips/iofuncs/Makefile.am
@@ -44,6 +44,7 @@ libiofuncs_la_SOURCES = \
 	im_init_world.c \
 	buf.c \
 	window.c \
+	vector.c \
 	buffer.c \
 	time.c 
 
diff --git a/libvips/iofuncs/im_init_world.c b/libvips/iofuncs/im_init_world.c
index 3874a8d4..cb883d16 100644
--- a/libvips/iofuncs/im_init_world.c
+++ b/libvips/iofuncs/im_init_world.c
@@ -63,6 +63,7 @@
 #include <vips/vips.h>
 #include <vips/thread.h>
 #include <vips/internal.h>
+#include <vips/vector.h>
 
 #ifdef WITH_DMALLOC
 #include <dmalloc.h>
@@ -223,6 +224,10 @@ im_init_world( const char *argv0 )
 	 */
 	im__buffer_init();
 
+	/* Get the run-time compiler going.
+	 */
+	vips_vector_init();
+
 	done = TRUE;
 
 	return( 0 );
@@ -268,6 +273,9 @@ static GOptionEntry option_entries[] = {
 	{ "vips-disc-threshold", 'd', 0, G_OPTION_ARG_STRING, 
 		&im__disc_threshold, 
 		N_( "image size above which to decompress to disc" ), NULL },
+	{ "vips-novector", 't', G_OPTION_FLAG_REVERSE, G_OPTION_ARG_NONE, 
+			&im__vector_enabled, 
+		N_( "disable vectorised versions of operations" ), NULL },
 	{ NULL }
 };
 
diff --git a/libvips/iofuncs/vector.c b/libvips/iofuncs/vector.c
new file mode 100644
index 00000000..395ec429
--- /dev/null
+++ b/libvips/iofuncs/vector.c
@@ -0,0 +1,330 @@
+/* helper functions for Orc
+ *
+ * 29/10/10
+ * 	- from morph hacking
+ */
+
+/*
+
+    This file is part of VIPS.
+    
+    VIPS is free software; you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+ */
+
+/*
+
+    These files are distributed with VIPS - http://www.vips.ecs.soton.ac.uk
+
+ */
+
+/* 
+
+ 	TODO
+
+	- would setting params by index rather than name be any quicker?
+
+ */
+
+/* Verbose messages from Orc (or use ORC_DEBUG=99 on the command-line).
+#define DEBUG_ORC
+ */
+
+/*
+#define DEBUG
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif /*HAVE_CONFIG_H*/
+#include <vips/intl.h>
+
+#include <vips/vips.h>
+#include <vips/vector.h>
+
+#ifdef WITH_DMALLOC
+#include <dmalloc.h>
+#endif /*WITH_DMALLOC*/
+
+/* Cleared by the command-line --vips-novector switch and the IM_NOVECTOR env
+ * var.
+ */
+gboolean im__vector_enabled = TRUE;
+
+void 
+vips_vector_init( void )
+{
+#ifdef HAVE_ORC
+	orc_init();
+
+#ifdef DEBUG_ORC
+	/* You can also do ORC_DEBUG=99 at the command-line.
+	 */
+	orc_debug_set_level( 99 );
+#endif /*DEBUG_ORC*/
+
+	/* Look for the environment variable IM_NOVECTOR and use that to turn
+	 * off as well.
+	 */
+	if( g_getenv( "IM_NOVECTOR" ) ) 
+		im__vector_enabled = FALSE;
+#endif /*HAVE_ORC*/
+}
+
+gboolean 
+vips_vector_get_enabled( void )
+{
+	return( im__vector_enabled );
+}
+
+void 
+vips_vector_set_enabled( gboolean enabled )
+{
+	im__vector_enabled = enabled;
+}
+
+void
+vips_vector_free( VipsVector *vector )
+{
+#ifdef HAVE_ORC
+	IM_FREEF( orc_program_free, vector->program );
+#endif /*HAVE_ORC*/
+	IM_FREE( vector );
+}
+
+VipsVector *
+vips_vector_new_ds( const char *name, int size1, int size2 )
+{
+	VipsVector *vector = IM_NEW( NULL, VipsVector );
+
+	if( !vector )
+		return( NULL );
+	/* Nothing generated yet, apart from the one destination and one
+	 * source every new program is counted as starting with.
+	 */
+	vector->name = name;
+	vector->compiled = FALSE;
+	vector->n_temp = 0;
+	vector->n_constant = 0;
+	vector->n_parameter = 0;
+	vector->n_instruction = 0;
+	vector->n_source = 1;
+	vector->n_destination = 1;
+#ifdef HAVE_ORC
+	vector->program = orc_program_new_ds( size1, size2 );
+#endif /*HAVE_ORC*/
+
+	return( vector );
+}
+
+void
+vips_vector_asm2( VipsVector *vector,
+	const char *op, const char *a, const char *b )
+{
+#ifdef DEBUG
+	printf( "  %s %s %s\n", op, a, b );
+#endif /*DEBUG*/
+	/* Counted with or without Orc: vips_vector_full() reads this.
+	 */
+	vector->n_instruction++;
+#ifdef HAVE_ORC
+	orc_program_append_ds_str( vector->program, op, a, b );
+#endif /*HAVE_ORC*/
+}
+
+void
+vips_vector_asm3( VipsVector *vector,
+	const char *op, const char *a, const char *b, const char *c )
+{
+#ifdef DEBUG
+	printf( "  %s %s %s %s\n", op, a, b, c );
+#endif /*DEBUG*/
+	/* Counted with or without Orc: vips_vector_full() reads this.
+	 */
+	vector->n_instruction++;
+#ifdef HAVE_ORC
+	orc_program_append_str( vector->program, op, a, b, c );
+#endif /*HAVE_ORC*/
+}
+
+void
+vips_vector_constant( VipsVector *vector, char *name, int value, int size )
+{
+#ifdef HAVE_ORC
+	/* Build a name like "c12w" or "cm3b" into @name (>= 256 bytes) and
+	 * add the constant once. Suffix b, w, l means 1, 2, 4 bytes.
+	 */
+	char *sname;
+
+	if( size == 1 )
+		sname = "b";
+	else if( size == 2 )
+		sname = "w";
+	else if( size == 4 )
+		sname = "l";
+	else {
+		printf( "vips_vector_constant: bad constant size\n" );
+		sname = "x";	/* Not really correct, heh. */
+	}
+
+	/* "m" marks negatives only: zero is "c0", not "cm0". */
+	if( value >= 0 )
+		im_snprintf( name, 256, "c%d%s", value, sname );
+	else
+		im_snprintf( name, 256, "cm%d%s", -value, sname );
+	if( orc_program_find_var_by_name( vector->program, name ) == -1 ) {
+		orc_program_add_constant( vector->program, size, value, name );
+		vector->n_constant += 1;
+	}
+#endif /*HAVE_ORC*/
+}
+
+void
+vips_vector_source_name( VipsVector *vector, char *name, int size )
+{
+#ifdef HAVE_ORC
+#ifdef DEBUG
+	if( orc_program_find_var_by_name( vector->program, name ) != -1 ) 
+		printf( "argh! source %s defined twice\n", name );
+#endif /*DEBUG*/
+
+	orc_program_add_source( vector->program, size, name );
+	vector->n_source += 1;
+#endif /*HAVE_ORC*/
+}
+
+void
+vips_vector_source( VipsVector *vector, char *name, int number, int size )
+{
+#ifdef HAVE_ORC
+	im_snprintf( name, 256, "s%d", number );
+
+	if( orc_program_find_var_by_name( vector->program, name ) == -1 ) 
+		vips_vector_source_name( vector, name, size ); 
+#endif /*HAVE_ORC*/
+}
+
+void
+vips_vector_temporary( VipsVector *vector, char *name, int size )
+{
+#ifdef HAVE_ORC
+	orc_program_add_temporary( vector->program, size, name );
+	vector->n_temp += 1;
+#endif /*HAVE_ORC*/
+}
+
+gboolean
+vips_vector_full( VipsVector *vector )
+{
+	/* TRUE once another coefficient might not fit: each one can need up
+	 * to 2 constants and 1 source. NOTE(review): 16 and 8 look like Orc's
+	 * per-program caps; we also stop at 50 of the 100 instructions.
+	 */
+	if( vector->n_constant > 16 - 2 )
+		return( TRUE );
+	if( vector->n_source > 8 - 1 )
+		return( TRUE );
+	if( vector->n_instruction > 50 )
+		return( TRUE );
+
+	return( FALSE );
+}
+
+gboolean
+vips_vector_compile( VipsVector *vector )
+{
+#ifdef HAVE_ORC
+	OrcCompileResult result;
+
+	result = orc_program_compile( vector->program );
+	if( !ORC_COMPILE_RESULT_IS_SUCCESSFUL( result ) ) {
+#ifdef DEBUG
+		printf( "*** error compiling %s\n", vector->name );
+#endif /*DEBUG*/
+
+		return( FALSE );
+	}
+
+	vector->compiled = TRUE;
+#endif /*HAVE_ORC*/
+
+	return( TRUE );
+}
+
+void
+vips_vector_print( VipsVector *vector )
+{
+	printf( "%s: ", vector->name );
+	if( vector->compiled )
+		printf( "successfully compiled\n" );
+	else
+		printf( "not compiled successfully\n" );
+	printf( "  n_source = %d\n", vector->n_source );
+	printf( "  n_parameter = %d\n", vector->n_parameter );
+	printf( "  n_destination = %d\n", vector->n_destination );
+	printf( "  n_constant = %d\n", vector->n_constant );
+	printf( "  n_temp = %d\n", vector->n_temp );
+	printf( "  n_instruction = %d\n", vector->n_instruction );
+}
+
+void
+vips_executor_set_program( VipsExecutor *executor, VipsVector *vector, int n )
+{
+#ifdef HAVE_ORC
+	orc_executor_set_program( executor, vector->program );
+	orc_executor_set_n( executor, n );
+#endif /*HAVE_ORC*/
+}
+
+void
+vips_executor_set_source( VipsExecutor *executor, int n, void *value )
+{
+#ifdef HAVE_ORC
+	char name[256];
+
+	/* Source n is called "s<n>"; the array setter skips it when the
+	 * program has no variable of that name.
+	 */
+	im_snprintf( name, 256, "s%d", n );
+	vips_executor_set_array( executor, name, value );
+#endif /*HAVE_ORC*/
+}
+
+void
+vips_executor_set_destination( VipsExecutor *executor, void *value )
+{
+#ifdef HAVE_ORC
+	orc_executor_set_array_str( executor, "d1", value );
+#endif /*HAVE_ORC*/
+}
+
+void
+vips_executor_set_array( VipsExecutor *executor, char *name, void *value )
+{
+#ifdef HAVE_ORC
+	OrcProgram *program = executor->program;
+
+	if( orc_program_find_var_by_name( program, name ) != -1 ) 
+		orc_executor_set_array_str( executor, name, value );
+#endif /*HAVE_ORC*/
+}
+
+void
+vips_executor_run( VipsExecutor *executor )
+{
+#ifdef HAVE_ORC
+	orc_executor_run( executor );
+#endif /*HAVE_ORC*/
+}
+
diff --git a/libvips/morphology/Makefile.am b/libvips/morphology/Makefile.am
index 9de3bb12..1c32ef82 100644
--- a/libvips/morphology/Makefile.am
+++ b/libvips/morphology/Makefile.am
@@ -2,8 +2,7 @@ noinst_LTLIBRARIES = libmorphology.la
 
 libmorphology_la_SOURCES = \
 	im_cntlines.c \
-	im_dilate.c\
-	im_erode.c\
+	morphology.c\
 	im_rank.c \
 	im_rank_image.c \
 	im_zerox.c \
diff --git a/libvips/morphology/im_dilate.c b/libvips/morphology/im_dilate.c
deleted file mode 100644
index 3df3c666..00000000
--- a/libvips/morphology/im_dilate.c
+++ /dev/null
@@ -1,349 +0,0 @@
-/* @(#) Function which dilates a binary VASARI format picture with a mask.
- * @(#) The mask coefficients are either 255 (object) or 0 (bk) or 128 (any).
- * @(#) Input image are binary images with either 0 or 255 values, one channel 
- * @(#) only. The program dilates a white object on a black background.
- * @(#) The center of the mask is at location (m->xsize/2, m->ysize/2)
- * @(#) integer division. The mask is expected to have an odd width and
- * @(#) height.
- * @(#)
- * @(#) int im_dilate(in, out, m)
- * @(#) IMAGE *in, *out;
- * @(#) INTMASK *m;
- * @(#)
- * @(#) Returns either 0 (sucess) or -1 (fail)
- *
- * 19/9/95 JC
- *	- rewritten
- * 6/7/99 JC
- *	- small tidies
- * 7/4/04 
- *	- now uses im_embed() with edge stretching on the input, not
- *	  the output
- *	- sets Xoffset / Yoffset
- * 21/4/08
- * 	- only rebuild the buffer offsets if bpl changes
- * 	- small cleanups
- */
-
-/*
-
-    This file is part of VIPS.
-    
-    VIPS is free software; you can redistribute it and/or modify
-    it under the terms of the GNU Lesser General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-
- */
-
-/*
-
-    These files are distributed with VIPS - http://www.vips.ecs.soton.ac.uk
-
- */
-
-/*
-#define DEBUG
- */
-
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif /*HAVE_CONFIG_H*/
-#include <vips/intl.h>
-
-#include <stdio.h>
-#include <stdlib.h>
-
-#include <vips/vips.h>
-
-#ifdef WITH_DMALLOC
-#include <dmalloc.h>
-#endif /*WITH_DMALLOC*/
-
-/* Our sequence value.
- */
-typedef struct {
-	REGION *ir;		/* Input region */
-
-	int *soff;		/* Offsets we check for set */
-	int ss;			/* ... and number we check for set */
-	int *coff;		/* Offsets we check for clear */
-	int cs;			/* ... and number we check for clear */
-	int last_bpl;		/* Avoid recalcing offsets, if we can */
-} SeqInfo;
-
-/* Stop function.
- */
-static int
-dilate_stop( void *vseq, void *a, void *b )
-{
-	SeqInfo *seq = (SeqInfo *) vseq;
-
-	IM_FREEF( im_region_free, seq->ir );
-
-	return( 0 );
-}
-
-/* Start function.
- */
-static void *
-dilate_start( IMAGE *out, void *a, void *b )
-{
-	IMAGE *in = (IMAGE *) a;
-	INTMASK *msk = (INTMASK *) b;
-	int sz = msk->xsize * msk->ysize;
-	SeqInfo *seq;
-
-	if( !(seq = IM_NEW( out, SeqInfo )) )
-		return( NULL );
-
-	/* Init!
-	 */
-	seq->ir = NULL;
-	seq->soff = NULL;
-	seq->ss = 0;
-	seq->coff = NULL;
-	seq->cs = 0;
-	seq->last_bpl = -1;
-
-	/* Attach region and arrays.
-	 */
-	seq->ir = im_region_create( in );
-	seq->soff = IM_ARRAY( out, sz, int );
-	seq->coff = IM_ARRAY( out, sz, int );
-	if( !seq->ir || !seq->soff || !seq->coff ) {
-		dilate_stop( seq, in, NULL );
-		return( NULL );
-	}
-
-	return( seq );
-}
-
-/* Dilate!
- */
-static int
-dilate_gen( REGION *or, void *vseq, void *a, void *b )
-{
-	SeqInfo *seq = (SeqInfo *) vseq;
-	INTMASK *msk = (INTMASK *) b;
-	REGION *ir = seq->ir;
-
-	int *soff = seq->soff;
-	int *coff = seq->coff;
-
-	Rect *r = &or->valid;
-	Rect s;
-	int le = r->left;
-	int to = r->top;
-	int bo = IM_RECT_BOTTOM(r);
-	int sz = IM_REGION_N_ELEMENTS( or );
-
-	int *t;
-
-	int x, y;
-	int result, i;
-
-	/* Prepare the section of the input image we need. A little larger
-	 * than the section of the output image we are producing.
-	 */
-	s = *r;
-	s.width += msk->xsize - 1;
-	s.height += msk->ysize - 1;
-	if( im_prepare( ir, &s ) )
-		return( -1 );
-
-#ifdef DEBUG
-	printf( "erode_gen: preparing %dx%d pixels\n", s.width, s.height );
-#endif /*DEBUG*/
-
-	/* Scan mask, building offsets we check when processing. Only do this
-	 * if the bpl has changed since the previous im_prepare().
-	 */
-	if( seq->last_bpl != IM_REGION_LSKIP( ir ) ) {
-		seq->last_bpl = IM_REGION_LSKIP( ir );
-
-		seq->ss = 0;
-		seq->cs = 0;
-		for( t = msk->coeff, y = 0; y < msk->ysize; y++ )
-			for( x = 0; x < msk->xsize; x++, t++ )
-				switch( *t ) {
-				case 255:
-					soff[seq->ss++] = 
-						IM_REGION_ADDR( ir, 
-							x + le, y + to ) - 
-						IM_REGION_ADDR( ir, le, to );
-					break;
-
-				case 128:
-					break;
-
-				case 0:
-					coff[seq->cs++] = 
-						IM_REGION_ADDR( ir, 
-							x + le, y + to ) - 
-						IM_REGION_ADDR( ir, le, to );
-					break;
-
-				default:
-					im_error( "im_dilate", 
-						_( "bad mask element (%d "
-						"should be 0, 128 or 255)" ), 
-						*t );
-					return( -1 ); 
-				}
-	}
-
-	/* Dilate!
-	 */
-	for( y = to; y < bo; y++ ) {
-		PEL *p = (PEL *) IM_REGION_ADDR( ir, le, y );
-		PEL *q = (PEL *) IM_REGION_ADDR( or, le, y );
-
-		/* Loop along line.
-		 */
-		for( x = 0; x < sz; x++, q++, p++ ) {
-			/* Search for a hit on the set list.
-			 */
-			result = 0;
-			for( i = 0; i < seq->ss; i++ )
-				if( p[soff[i]] ) {
-					/* Found a match! 
-					 */
-					result = 255;
-					break;
-				}
-
-			/* No set pixels ... search for a hit in the clear
-			 * pixels.
-			 */
-			if( !result )
-				for( i = 0; i < seq->cs; i++ )
-					if( !p[coff[i]] ) {
-						/* Found a match! 
-						 */
-						result = 255;
-						break;
-					}
-
-			*q = result;
-
-		}
-	}
-
-	return( 0 );
-}
-
-/* Dilate an image.
- */
-int
-im_dilate_raw( IMAGE *in, IMAGE *out, INTMASK *m )
-{
-	INTMASK *msk;
-
-	/* Check mask has odd number of elements in width and height.
-	 */
-	if( m->xsize < 1 || !(m->xsize & 0x1) ||
-		m->ysize < 1 || !(m->ysize & 0x1) ) {
-		im_error( "im_dilate", "%s", _( "mask size not odd" ) ); 
-		return( -1 ); 
-	}
-
-	/* Standard checks.
-	 */
-	if( im_piocheck( in, out ) ) 
-		return( -1 ); 
-	if( in->Coding != IM_CODING_NONE || 
-		in->BandFmt != IM_BANDFMT_UCHAR ) {
-		im_error( "im_dilate", "%s", _( "uchar uncoded only" ) );
-		return( -1 );
-	}
-	if( im_cp_desc( out, in ) ) 
-		return( -1 ); 
-
-	/* Prepare output. Consider a 7x7 mask and a 7x7 image --- the output
-	 * would be 1x1.
-	 */
-	if( im_cp_desc( out, in ) )
-		return( -1 );
-	out->Xsize -= m->xsize - 1;
-	out->Ysize -= m->ysize - 1;
-	if( out->Xsize <= 0 || out->Ysize <= 0 ) {
-		im_error( "im_dilate", "%s", _( "image too small for mask" ) );
-		return( -1 );
-	}
-
-	/* Take a copy of m.
-	 */
-	if( !(msk = im_dup_imask( m, "conv_mask" )) )
-		return( -1 );
-	if( im_add_close_callback( out, 
-		(im_callback_fn) im_free_imask, msk, NULL ) ) {
-		im_free_imask( msk );
-		return( -1 );
-	}
-
-	/* Set demand hints. FATSTRIP is good for us, as THINSTRIP will cause
-	 * too many recalculations on overlaps.
-	 */
-	if( im_demand_hint( out, IM_FATSTRIP, in, NULL ) )
-		return( -1 );
-
-	/* Generate! 
-	 */
-	if( im_generate( out, dilate_start, dilate_gen, dilate_stop, in, msk ) )
-		return( -1 );
-
-	out->Xoffset = -m->xsize / 2;
-	out->Yoffset = -m->ysize / 2;
-
-	return( 0 );
-}
-
-
-/**
- * im_dilate:
- * @in: input image
- * @out: output image
- * @m: dilate mask
- *
- * Dilate an image with a mask.
- * The mask coefficients are either 255 (object) or 0 (black) or 128 (don't
- * care).
- * Input image are binary images with either 0 or 255 values, one channel 
- * only. The program dilates a white object on a black background.
- * The center of the mask is at location (m->xsize/2, m->ysize/2)
- * integer division. The mask is expected to have an odd width and
- * height.
-
-sets pixels in the output if
- *
- * See also: im_erode().
- *
- * Returns: 0 on success, -1 on error
- */
-int 
-im_dilate( IMAGE *in, IMAGE *out, INTMASK *m )
-{
-	IMAGE *t1 = im_open_local( out, "im_dilate:1", "p" );
-
-	if( !t1 || 
-		im_embed( in, t1, 1, m->xsize / 2, m->ysize / 2, 
-			in->Xsize + m->xsize - 1, 
-			in->Ysize + m->ysize - 1 ) ||
-		im_dilate_raw( t1, out, m ) )
-		return( -1 );
-
-	out->Xoffset = 0;
-	out->Yoffset = 0;
-
-	return( 0 );
-}
diff --git a/libvips/morphology/im_erode.c b/libvips/morphology/im_erode.c
deleted file mode 100644
index ccf983c3..00000000
--- a/libvips/morphology/im_erode.c
+++ /dev/null
@@ -1,325 +0,0 @@
-/* @(#) Function which erodes a binary VASARI format picture with a mask.
- * @(#) The mask coefficients are either 255 (object) or 0 (bk) or 128 (any).
- * @(#) Input image are binary images with either 0 or 255 values, one channel 
- * @(#) only. The program erodes a white object on a black background.
- * @(#) The center of the mask is at location (m->xsize/2, m->ysize/2)
- * @(#) integer division. The mask is expected to have an odd width and
- * @(#) height.
- * @(#)
- * @(#) int im_erode(in, out, m)
- * @(#) IMAGE *in, *out;
- * @(#) INTMASK *m;
- * @(#)
- * @(#) Returns either 0 (sucess) or -1 (fail)
- *
- * 19/9/95 JC
- *	- rewrite
- * 6/7/99 JC
- *	- checks and small tidies
- * 7/4/04 
- *	- now uses im_embed() with edge stretching on the input, not
- *	  the output
- *	- sets Xoffset / Yoffset
- * 21/4/08
- * 	- only rebuild the buffer offsets if bpl changes
- * 	- small cleanups
- */
-
-/*
-
-    This file is part of VIPS.
-    
-    VIPS is free software; you can redistribute it and/or modify
-    it under the terms of the GNU Lesser General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-
- */
-
-/*
-
-    These files are distributed with VIPS - http://www.vips.ecs.soton.ac.uk
-
- */
-
-/*
-#define DEBUG
- */
-
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif /*HAVE_CONFIG_H*/
-#include <vips/intl.h>
-
-#include <stdio.h>
-#include <stdlib.h>
-
-#include <vips/vips.h>
-
-#ifdef WITH_DMALLOC
-#include <dmalloc.h>
-#endif /*WITH_DMALLOC*/
-
-/* Our sequence value.
- */
-typedef struct {
-	REGION *ir;		/* Input region */
-
-	int *soff;		/* Offsets we check for set */
-	int ss;			/* ... and number we check for set */
-	int *coff;		/* Offsets we check for clear */
-	int cs;			/* ... and number we check for clear */
-	int last_bpl;		/* Avoid recalcing offsets, if we can */
-} SeqInfo;
-
-/* Stop function.
- */
-static int
-erode_stop( void *vseq, void *a, void *b )
-{
-	SeqInfo *seq = (SeqInfo *) vseq;
-
-	IM_FREEF( im_region_free, seq->ir );
-
-	return( 0 );
-}
-
-/* Start function.
- */
-static void *
-erode_start( IMAGE *out, void *a, void *b )
-{
-	IMAGE *in = (IMAGE *) a;
-	INTMASK *msk = (INTMASK *) b;
-	SeqInfo *seq;
-	int sz = msk->xsize * msk->ysize;
-
-	if( !(seq = IM_NEW( out, SeqInfo )) )
-		return( NULL );
-
-	/* Init!
-	 */
-	seq->ir = NULL;
-	seq->soff = NULL;
-	seq->ss = 0;
-	seq->coff = NULL;
-	seq->cs = 0;
-	seq->last_bpl = -1;
-
-	/* Attach region and arrays.
-	 */
-	seq->ir = im_region_create( in );
-	seq->soff = IM_ARRAY( out, sz, int );
-	seq->coff = IM_ARRAY( out, sz, int );
-	if( !seq->ir || !seq->soff || !seq->coff ) {
-		erode_stop( seq, in, NULL );
-		return( NULL );
-	}
-
-	return( (void *) seq );
-}
-
-/* Erode!
- */
-static int
-erode_gen( REGION *or, void *vseq, void *a, void *b )
-{
-	SeqInfo *seq = (SeqInfo *) vseq;
-	INTMASK *msk = (INTMASK *) b;
-	REGION *ir = seq->ir;
-
-	int *soff = seq->soff;
-	int *coff = seq->coff;
-
-	Rect *r = &or->valid;
-	Rect s;
-	int le = r->left;
-	int to = r->top;
-	int bo = IM_RECT_BOTTOM(r);
-	int sz = IM_REGION_N_ELEMENTS( or );
-
-	int *t;
-
-	int x, y;
-	int result, i;
-
-	/* Prepare the section of the input image we need. A little larger
-	 * than the section of the output image we are producing.
-	 */
-	s = *r;
-	s.width += msk->xsize - 1;
-	s.height += msk->ysize - 1;
-	if( im_prepare( ir, &s ) )
-		return( -1 );
-
-#ifdef DEBUG
-	printf( "erode_gen: preparing %dx%d pixels\n", s.width, s.height );
-#endif /*DEBUG*/
-
-	/* Scan mask, building offsets we check when processing. Only do this
-	 * if the bpl has changed since the previous im_prepare().
-	 */
-	if( seq->last_bpl != IM_REGION_LSKIP( ir ) ) {
-		seq->last_bpl = IM_REGION_LSKIP( ir );
-
-		seq->ss = 0;
-		seq->cs = 0;
-		for( t = msk->coeff, y = 0; y < msk->ysize; y++ )
-			for( x = 0; x < msk->xsize; x++, t++ )
-				switch( *t ) {
-				case 255:
-					soff[seq->ss++] = 
-						IM_REGION_ADDR( ir, 
-							x + le, y + to ) - 
-						IM_REGION_ADDR( ir, le, to );
-					break;
-
-				case 128:
-					break;
-
-				case 0:
-					coff[seq->cs++] = 
-						IM_REGION_ADDR( ir, 
-							x + le, y + to ) - 
-						IM_REGION_ADDR( ir, le, to );
-					break;
-
-				default:
-					im_error( "im_erode", 
-						_( "bad mask element (%d "
-						"should be 0, 128 or 255)" ), 
-						*t );
-					return( -1 ); 
-				}
-	}
-
-	/* Erode!
-	 */
-	for( y = to; y < bo; y++ ) {
-		PEL *p = (PEL *) IM_REGION_ADDR( ir, le, y );
-		PEL *q = (PEL *) IM_REGION_ADDR( or, le, y );
-
-		/* Loop along line.
-		 */
-		for( x = 0; x < sz; x++, q++, p++ ) {
-			/* Check all set pixels are set.
-			 */
-			result = 255;
-			for( i = 0; i < seq->ss; i++ )
-				if( !p[soff[i]] ) {
-					/* Found a mismatch! 
-					 */
-					result = 0;
-					break;
-				}
-
-			/* Check all clear pixels are clear.
-			 */
-			if( result )
-				for( i = 0; i < seq->cs; i++ )
-					if( p[coff[i]] ) {
-						result = 0;
-						break;
-					}
-
-			*q = result;
-		}
-	}
-	
-	return( 0 );
-}
-
-/* Erode an image.
- */
-int
-im_erode_raw( IMAGE *in, IMAGE *out, INTMASK *m )
-{
-	INTMASK *msk;
-
-	/* Check mask has odd number of elements in width and height.
-	 */
-	if( m->xsize < 1 || !(m->xsize & 0x1) ||
-		m->ysize < 1 || !(m->ysize & 0x1) ) {
-		im_error( "im_erode", "%s", _( "mask size not odd" ) ); 
-		return( -1 ); 
-	}
-
-	/* Standard checks.
-	 */
-	if( im_piocheck( in, out ) ) 
-		return( -1 ); 
-	if( in->Coding != IM_CODING_NONE || 
-		in->BandFmt != IM_BANDFMT_UCHAR ) {
-		im_error( "im_erode", "%s", _( "1-band uchar uncoded only" ) );
-		return( -1 );
-	}
-	if( im_cp_desc( out, in ) ) 
-		return( -1 ); 
-
-	/* Prepare output. Consider a 7x7 mask and a 7x7 image --- the output
-	 * would be 1x1.
-	 */
-	if( im_cp_desc( out, in ) )
-		return( -1 );
-	out->Xsize -= m->xsize - 1;
-	out->Ysize -= m->ysize - 1;
-	if( out->Xsize <= 0 || out->Ysize <= 0 ) {
-		im_error( "im_erode", "%s", _( "image too small for mask" ) );
-		return( -1 );
-	}
-
-	/* Take a copy of m.
-	 */
-	if( !(msk = im_dup_imask( m, "conv_mask" )) )
-		return( -1 );
-	if( im_add_close_callback( out, 
-		(im_callback_fn) im_free_imask, msk, NULL ) ) {
-		im_free_imask( msk );
-		return( -1 );
-	}
-
-	/* Set demand hints. FATSTRIP is good for us, as THINSTRIP will cause
-	 * too many recalculations on overlaps.
-	 */
-	if( im_demand_hint( out, IM_FATSTRIP, in, NULL ) )
-		return( -1 );
-
-	/* Generate! 
-	 */
-	if( im_generate( out, erode_start, erode_gen, erode_stop, in, msk ) )
-		return( -1 );
-
-	out->Xoffset = -m->xsize / 2;
-	out->Yoffset = -m->ysize / 2;
-
-	return( 0 );
-}
-
-/* The above, with a border to make out the same size as in.
- */
-int 
-im_erode( IMAGE *in, IMAGE *out, INTMASK *m )
-{
-	IMAGE *t1 = im_open_local( out, "im_erode:1", "p" );
-
-	if( !t1 || 
-		im_embed( in, t1, 1, m->xsize / 2, m->ysize / 2, 
-			in->Xsize + m->xsize - 1, 
-			in->Ysize + m->ysize - 1 ) ||
-		im_erode_raw( t1, out, m ) )
-		return( -1 );
-
-	out->Xoffset = 0;
-	out->Yoffset = 0;
-
-	return( 0 );
-}
diff --git a/libvips/morphology/morphology.c b/libvips/morphology/morphology.c
new file mode 100644
index 00000000..64339355
--- /dev/null
+++ b/libvips/morphology/morphology.c
@@ -0,0 +1,823 @@
+/* morphological operators
+ *
+ * 19/9/95 JC
+ *	- rewritten
+ * 6/7/99 JC
+ *	- small tidies
+ * 7/4/04 
+ *	- now uses im_embed() with edge stretching on the input, not
+ *	  the output
+ *	- sets Xoffset / Yoffset
+ * 21/4/08
+ * 	- only rebuild the buffer offsets if bpl changes
+ * 	- small cleanups
+ * 25/10/10
+ * 	- start again from the Orc'd im_conv
+ * 29/10/10
+ * 	- use VipsVector
+ * 	- do erode as well 
+ */
+
+/*
+
+    This file is part of VIPS.
+    
+    VIPS is free software; you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+ */
+
+/*
+
+    These files are distributed with VIPS - http://www.vips.ecs.soton.ac.uk
+
+ */
+
+/*
+#define DEBUG
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif /*HAVE_CONFIG_H*/
+#include <vips/intl.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <limits.h>
+
+#include <vips/vips.h>
+#include <vips/vector.h>
+
+#ifdef WITH_DMALLOC
+#include <dmalloc.h>
+#endif /*WITH_DMALLOC*/
+
+/* The two operators we implement. Strictly, both are hit-or-miss matches.
+ */
+typedef enum {
+	ERODE,
+	DILATE
+} MorphOp;
+
+/* The most vector passes we will compile for one mask. A mask that needs
+ * more than this is handled by the plain C generators instead.
+ */
+#define MAX_PASSES (10)
+
+/* One vector pass: a compiled program covering a run of mask elements.
+ */
+typedef struct {
+	int first;		/* The index of the first mask coeff we use */
+	int last;		/* The index of the last mask coeff we use */
+
+        /* The code we generate for this section of this mask. 
+	 */
+        VipsVector *vector;
+} Pass;
+
+/* Per-call state: images, operator, mask copy and compiled vector passes.
+ */
+typedef struct {
+	IMAGE *in;
+	IMAGE *out;
+	INTMASK *mask;		/* Copy of mask arg, freed in morph_close() */
+	MorphOp op;
+
+	/* The passes we generate for this mask; n_pass == 0 means C only.
+	 */
+	int n_pass;	
+	Pass pass[MAX_PASSES];
+} Morph;
+
+/* Drop every compiled vector program, leaving morph with no passes. */
+static void
+pass_free( Morph *morph )
+{
+	int n = morph->n_pass;
+	while( n-- > 0 )
+		IM_FREEF( vips_vector_free, morph->pass[n].vector );
+	morph->n_pass = 0;
+}
+
+/* Close callback: release the vector programs and our copy of the mask. */
+static int
+morph_close( Morph *morph )
+{
+	pass_free( morph );
+	IM_FREEF( im_free_imask, morph->mask );
+	return( 0 );
+}
+/* Shorthand for emitting vector code: all use a local VipsVector *v. */
+#define TEMP( N, S ) vips_vector_temporary( v, N, S )
+#define SRC( N, P, S ) vips_vector_source( v, N, P, S )
+#define CONST( N, V, S ) vips_vector_constant( v, N, V, S )
+#define ASM2( OP, A, B ) vips_vector_asm2( v, OP, A, B )
+#define ASM3( OP, A, B, C ) vips_vector_asm3( v, OP, A, B, C )
+
+/* Generate code for a section of the mask. first is the index we start
+ * at, we set last to the index we stopped at: the last coefficient used
+ * before the vector filled up, or the mask size if we reached the end of 
+ * the mask (including when only don't-care elements were left).
+ *
+ * 0 for success, -1 on error.
+ */
+static int
+pass_compile_section( Morph *morph, int first, int *last )
+{
+	INTMASK *mask = morph->mask;
+	const int n_mask = mask->xsize * mask->ysize; 
+
+	Pass *pass;
+	VipsVector *v;
+	char offset[256];
+	char source[256];
+	char zero[256];
+	char one[256];
+	int i;
+
+	/* Skip any don't-care coefficients at the start of the mask region.
+	 * Test the index first so we never read past the end of the mask.
+	 */
+	for( ; first < n_mask && mask->coeff[first] == 128; first++ )
+		;
+	if( first == n_mask ) {
+		*last = n_mask;
+		return( 0 );
+	}
+
+	/* Allocate space for another pass.
+	 */
+	if( morph->n_pass == MAX_PASSES ) 
+		return( -1 );
+	pass = &morph->pass[morph->n_pass];
+	morph->n_pass += 1;
+	pass->first = first;
+
+	/* Start with a single source scanline, we add more as we need them.
+	 */
+	pass->vector = v = vips_vector_new_ds( "morph", 1, 1 );
+
+	/* The value we fetch from the image, the accumulated sum.
+	 */
+	TEMP( "value", 1 );
+	TEMP( "sum", 1 );
+
+	CONST( zero, 0, 1 );
+	CONST( one, 255, 1 );
+
+	/* Init the sum: a constant on the first pass, the result of the 
+	 * previous pass on later ones.
+	 */
+	if( morph->n_pass == 1 ) {
+		if( morph->op == DILATE )
+			ASM2( "copyb", "sum", zero );
+		else
+			ASM2( "copyb", "sum", one );
+	}
+	else {
+		/* "r" is the result of the previous pass.
+		 */
+		vips_vector_source_name( v, "r", 1 );
+		ASM2( "loadb", "sum", "r" );
+	}
+
+	for( i = first; i < n_mask; i++ ) {
+		int x = i % mask->xsize;
+		int y = i / mask->xsize;
+
+		/* Exclude don't-care elements.
+		 */
+		if( mask->coeff[i] == 128 )
+			continue;
+
+		/* The source. s1 is the first scanline in the mask.
+		 */
+		vips_vector_source( v, source, y + 1, 1 );
+
+		/* The offset, only for non-first-columns though.
+		 */
+		if( x > 0 ) {
+			CONST( offset, morph->in->Bands * x, 1 );
+			ASM3( "loadoffb", "value", source, offset );
+		}
+		else
+			ASM2( "loadb", "value", source );
+
+		/* Join to our sum. If the mask element is zero, negate the
+		 * value first; andnb would complement sum, its first source.
+		 */
+		if( morph->op == DILATE ) {
+			if( !mask->coeff[i] ) 
+				ASM3( "xorb", "value", "value", one );
+			ASM3( "orb", "sum", "sum", "value" );
+		}
+		else {
+			if( !mask->coeff[i] ) 
+				ASM3( "xorb", "value", "value", one );
+			ASM3( "andb", "sum", "sum", "value" );
+		}
+
+		if( vips_vector_full( v ) )
+			break;
+	}
+
+	pass->last = i;
+	*last = i;
+
+	ASM2( "copyb", "d1", "sum" );
+
+	if( !vips_vector_compile( v ) ) 
+		return( -1 );
+
+#ifdef DEBUG
+	printf( "done matrix coeffs %d to %d\n", pass->first, pass->last );
+	vips_vector_print( v );
+#endif /*DEBUG*/
+
+	return( 0 );
+}
+
+/* Generate a set of passes covering the whole mask. 0 on success, -1 if a
+ * section can't be compiled.
+ */
+static int
+pass_compile( Morph *morph )
+{
+	INTMASK *mask = morph->mask;
+	const int n_mask = mask->xsize * mask->ysize; 
+
+	int i;
+
+#ifdef DEBUG
+	printf( "morph: generating vector code\n" );
+#endif /*DEBUG*/
+
+	/* Generate passes until we've used up the whole mask. last starts
+	 * each round at "mask finished", so a section that returns without
+	 * storing a stop index ends the loop instead of being read unset.
+	 */
+	for( i = 0; i < n_mask; ) {
+		int last = n_mask;
+
+		if( pass_compile_section( morph, i, &last ) ) 
+			return( -1 );
+		i = last + 1;
+	}
+
+	return( 0 );
+}
+/* Check the arguments and build the Morph for this call, compiling vector
+ * code for the mask where we can. NULL on error.
+ */
+static Morph *
+morph_new( IMAGE *in, IMAGE *out, INTMASK *mask, MorphOp op )
+{
+	Morph *morph;
+	int n_mask;
+	int i;
+
+	if( im_piocheck( in, out ) ||
+		im_check_uncoded( "morph", in ) ||
+		im_check_format( "morph", in, IM_BANDFMT_UCHAR ) ||
+		im_check_imask( "morph", mask ) ) 
+		return( NULL );
+
+	/* Only look inside the mask once it has passed im_check_imask().
+	 */
+	n_mask = mask->xsize * mask->ysize; 
+	for( i = 0; i < n_mask; i++ )
+		if( mask->coeff[i] != 0 && 
+			mask->coeff[i] != 128 &&
+			mask->coeff[i] != 255 ) {
+			im_error( "morph", 
+				_( "bad mask element (%d "
+				"should be 0, 128 or 255)" ), 
+				mask->coeff[i] );
+			return( NULL );
+		}
+
+	if( !(morph = IM_NEW( out, Morph )) )
+		return( NULL );
+	morph->in = in;
+	morph->out = out;
+	morph->mask = NULL;
+	morph->op = op;
+	morph->n_pass = 0;
+	for( i = 0; i < MAX_PASSES; i++ )
+		morph->pass[i].vector = NULL;
+
+	if( im_add_close_callback( out, 
+		(im_callback_fn) morph_close, morph, NULL ) ||
+		!(morph->mask = im_dup_imask( mask, "morph" )) )
+		return( NULL );
+
+	/* Generate code for this mask / image, if possible. On failure drop
+	 * any partial passes so that n_pass is 0.
+	 */
+	if( vips_vector_get_enabled() && pass_compile( morph ) )
+		pass_free( morph );
+	return( morph );
+}
+
+/* Our sequence value: the state morph_start() builds for the generators.
+ */
+typedef struct {
+	Morph *morph;
+	REGION *ir;		/* Input region */
+
+	int *soff;		/* Offsets we check for set */
+	int ss;			/* ... and number we check for set */
+	int *coff;		/* Offsets we check for clear */
+	int cs;			/* ... and number we check for clear */
+
+	int last_bpl;		/* Line skip the offsets were built for */
+
+	/* In vector mode we need a pair of intermediate buffers to keep the 
+	 * results of each pass in.
+	 */
+	void *t1;
+	void *t2;
+} MorphSequence;
+
+/* Free a sequence value: the input region and the two pass buffers.
+ */
+static int
+morph_stop( void *vseq, void *a, void *b )
+{
+	MorphSequence *seq = (MorphSequence *) vseq;
+
+	IM_FREEF( im_region_free, seq->ir );
+	IM_FREE( seq->t1 );
+	IM_FREE( seq->t2 );
+
+	return( 0 );
+}
+
+/* Morph start function: build a MorphSequence. a is in, b is the Morph.
+ */
+static void *
+morph_start( IMAGE *out, void *a, void *b )
+{
+	IMAGE *in = (IMAGE *) a;
+	Morph *morph = (Morph *) b;
+	int n_mask = morph->mask->xsize * morph->mask->ysize;
+	int sz = IM_IMAGE_N_ELEMENTS( in );
+
+	MorphSequence *seq;
+
+	if( !(seq = IM_NEW( out, MorphSequence )) )
+		return( NULL );
+
+	/* Init! Everything starts empty so morph_stop() is safe on failure.
+	 */
+	seq->morph = morph;
+	seq->ir = NULL;
+	seq->soff = NULL;
+	seq->ss = 0;
+	seq->coff = NULL;
+	seq->cs = 0;
+	seq->last_bpl = -1;
+	seq->t1 = NULL;
+	seq->t2 = NULL;
+
+	/* Attach region and arrays. t1 / t2 each hold sz PELs.
+	 */
+	seq->ir = im_region_create( in );
+	seq->soff = IM_ARRAY( out, n_mask, int );
+	seq->coff = IM_ARRAY( out, n_mask, int );
+	seq->t1 = IM_ARRAY( NULL, sz, PEL );
+	seq->t2 = IM_ARRAY( NULL, sz, PEL );
+	if( !seq->ir || !seq->soff || !seq->coff || !seq->t1 || !seq->t2  ) {
+		morph_stop( seq, in, NULL );
+		return( NULL );
+	}
+
+	return( seq );
+}
+
+/* Dilate in C: used when there is no vector code for this mask.
+ */
+static int
+dilate_gen( REGION *or, void *vseq, void *a, void *b )
+{
+	MorphSequence *seq = (MorphSequence *) vseq;
+	Morph *morph = (Morph *) b;
+	INTMASK *mask = morph->mask;
+	REGION *ir = seq->ir;
+
+	int *soff = seq->soff;
+	int *coff = seq->coff;
+
+	Rect *r = &or->valid;
+	Rect s;
+	int le = r->left;
+	int to = r->top;
+	int bo = IM_RECT_BOTTOM( r );
+	int sz = IM_REGION_N_ELEMENTS( or );
+
+	int *t;
+
+	int x, y;
+	int result, i;
+
+	/* Prepare the section of the input image we need. Larger than the
+	 * section of output we are producing by the mask size less one.
+	 */
+	s = *r;
+	s.width += mask->xsize - 1;
+	s.height += mask->ysize - 1;
+	if( im_prepare( ir, &s ) )
+		return( -1 );
+
+	/* Scan mask, building offsets we check when processing. Only do this
+	 * if the bpl (line skip) has changed since the previous im_prepare().
+	 */
+	if( seq->last_bpl != IM_REGION_LSKIP( ir ) ) {
+		seq->last_bpl = IM_REGION_LSKIP( ir );
+
+		seq->ss = 0;
+		seq->cs = 0;
+		for( t = mask->coeff, y = 0; y < mask->ysize; y++ )
+			for( x = 0; x < mask->xsize; x++, t++ )
+				switch( *t ) {
+				case 255:
+					soff[seq->ss++] = 
+						IM_REGION_ADDR( ir, 
+							x + le, y + to ) - 
+						IM_REGION_ADDR( ir, le, to );
+					break;
+
+				case 128:
+					break;
+
+				case 0:
+					coff[seq->cs++] = 
+						IM_REGION_ADDR( ir, 
+							x + le, y + to ) - 
+						IM_REGION_ADDR( ir, le, to );
+					break;
+
+				default:
+					g_assert( 0 );
+				}
+	}
+
+	/* Dilate! One output line at a time.
+	 */
+	for( y = to; y < bo; y++ ) {
+		PEL *p = (PEL *) IM_REGION_ADDR( ir, le, y );
+		PEL *q = (PEL *) IM_REGION_ADDR( or, le, y );
+
+		/* Loop along line.
+		 */
+		for( x = 0; x < sz; x++, q++, p++ ) {
+			/* Search for a hit on the set list: any non-zero will do.
+			 */
+			result = 0;
+			for( i = 0; i < seq->ss; i++ )
+				if( p[soff[i]] ) {
+					/* Found a match! 
+					 */
+					result = 255;
+					break;
+				}
+
+			/* No set pixels ... search for a hit in the clear
+			 * pixels: any zero will do.
+			 */
+			if( !result )
+				for( i = 0; i < seq->cs; i++ )
+					if( !p[coff[i]] ) {
+						/* Found a match! 
+						 */
+						result = 255;
+						break;
+					}
+
+			*q = result;
+
+		}
+	}
+
+	return( 0 );
+}
+
+/* Erode in C: used when there is no vector code. b is the Morph.
+ */
+static int
+erode_gen( REGION *or, void *vseq, void *a, void *b )
+{
+	MorphSequence *seq = (MorphSequence *) vseq;
+	INTMASK *msk = ((Morph *) b)->mask;
+	REGION *ir = seq->ir;
+
+	int *soff = seq->soff;
+	int *coff = seq->coff;
+
+	Rect *r = &or->valid;
+	Rect s;
+	int le = r->left;
+	int to = r->top;
+	int bo = IM_RECT_BOTTOM(r);
+	int sz = IM_REGION_N_ELEMENTS( or );
+
+	int *t;
+
+	int x, y;
+	int result, i;
+
+	/* Prepare the section of the input image we need. Larger than the
+	 * section of output we are producing by the mask size less one.
+	 */
+	s = *r;
+	s.width += msk->xsize - 1;
+	s.height += msk->ysize - 1;
+	if( im_prepare( ir, &s ) )
+		return( -1 );
+
+#ifdef DEBUG
+	printf( "erode_gen: preparing %dx%d pixels\n", s.width, s.height );
+#endif /*DEBUG*/
+
+	/* Scan mask, building offsets we check when processing. Only do this
+	 * if the bpl (line skip) has changed since the previous im_prepare().
+	 */
+	if( seq->last_bpl != IM_REGION_LSKIP( ir ) ) {
+		seq->last_bpl = IM_REGION_LSKIP( ir );
+
+		seq->ss = 0;
+		seq->cs = 0;
+		for( t = msk->coeff, y = 0; y < msk->ysize; y++ )
+			for( x = 0; x < msk->xsize; x++, t++ )
+				switch( *t ) {
+				case 255:
+					soff[seq->ss++] = 
+						IM_REGION_ADDR( ir, 
+							x + le, y + to ) - 
+						IM_REGION_ADDR( ir, le, to );
+					break;
+
+				case 128:
+					break;
+
+				case 0:
+					coff[seq->cs++] = 
+						IM_REGION_ADDR( ir, 
+							x + le, y + to ) - 
+						IM_REGION_ADDR( ir, le, to );
+					break;
+
+				default:
+					g_assert( 0 );
+				}
+	}
+
+	/* Erode! One output line at a time.
+	 */
+	for( y = to; y < bo; y++ ) {
+		PEL *p = (PEL *) IM_REGION_ADDR( ir, le, y );
+		PEL *q = (PEL *) IM_REGION_ADDR( or, le, y );
+
+		/* Loop along line.
+		 */
+		for( x = 0; x < sz; x++, q++, p++ ) {
+			/* Check all set pixels are set.
+			 */
+			result = 255;
+			for( i = 0; i < seq->ss; i++ )
+				if( !p[soff[i]] ) {
+					/* Found a mismatch! 
+					 */
+					result = 0;
+					break;
+				}
+
+			/* Check all clear pixels are clear.
+			 */
+			if( result )
+				for( i = 0; i < seq->cs; i++ )
+					if( p[coff[i]] ) {
+						result = 0;
+						break;
+					}
+
+			*q = result;
+		}
+	}
+	
+	return( 0 );
+}
+/* Run one pass on the line at (x, y) in ir: "r" is set to t1, "d1" to t2.
+ */
+static void
+pass_run( Morph *morph, Pass *pass, VipsExecutor *executor, 
+	REGION *ir, void *t1, void *t2, int x, int y )
+{
+	INTMASK *mask = morph->mask;
+	PEL *p = (PEL *) IM_REGION_ADDR( ir, x, y );
+	int lsk = IM_REGION_LSKIP( ir );
+	int top = pass->first / mask->xsize; 
+	int bottom = pass->last / mask->xsize; 
+	int i;
+
+	/* Generate all the scanline pointers this prog needs. pass->last is
+	 * the mask size when the pass reached the end of the mask, so also
+	 * stop at the final mask row.
+	 */
+	for( i = top; i <= bottom && i < mask->ysize; i++ ) 
+		vips_executor_set_source( executor, i + 1, p + i * lsk ); 
+
+	/* It might need the result from a previous pass.
+	 */
+	vips_executor_set_array( executor, "r", t1 );
+	vips_executor_set_array( executor, "d1", t2 );
+	vips_executor_run( executor );
+}
+
+/* The vector codepath: run every compiled pass over each output line.
+ */
+static int
+morph_vector_gen( REGION *or, void *vseq, void *a, void *b )
+{
+	MorphSequence *seq = (MorphSequence *) vseq;
+	Morph *morph = (Morph *) b;
+	INTMASK *mask = morph->mask;
+	REGION *ir = seq->ir;
+	Rect *r = &or->valid;
+	int sz = IM_REGION_N_ELEMENTS( or );
+
+	Rect s;
+	int y, j;
+	VipsExecutor executor[MAX_PASSES];
+
+	/* Prepare the section of the input image we need. Larger than the
+	 * section of output we are producing by the mask size less one.
+	 */
+	s = *r;
+	s.width += mask->xsize - 1;
+	s.height += mask->ysize - 1;
+	if( im_prepare( ir, &s ) )
+		return( -1 );
+
+	for( j = 0; j < morph->n_pass; j++ ) 
+		vips_executor_set_program( &executor[j], 
+			morph->pass[j].vector, sz );
+
+	for( y = 0; y < r->height; y++ ) { 
+		for( j = 0; j < morph->n_pass; j++ ) {
+			void *d;
+
+			/* The last pass writes the output image; earlier ones
+			 * write t2, which the swap below makes the next t1.
+			 */
+			if( j == morph->n_pass - 1 )
+				d = IM_REGION_ADDR( or, r->left, r->top + y );
+			else 
+				d = seq->t2;
+
+			pass_run( morph, &morph->pass[j], &executor[j], 
+				ir, seq->t1, d, r->left, r->top + y );
+
+			IM_SWAP( void *, seq->t1, seq->t2 );
+		}
+	}
+
+	return( 0 );
+}
+
+/* Morph an image: out is in shrunk by the mask size less one.
+ */
+static int
+morphology( IMAGE *in, IMAGE *out, INTMASK *mask, MorphOp op )
+{
+	Morph *morph;
+	im_generate_fn generate;
+
+	/* Check parameters and compile vector code, where possible.
+	 */
+	if( !(morph = morph_new( in, out, mask, op )) )
+		return( -1 );
+
+	/* Prepare output. Consider a 7x7 mask and a 7x7 image --- the output
+	 * would be 1x1.
+	 */
+	if( im_cp_desc( out, in ) )
+		return( -1 );
+	out->Xsize -= mask->xsize - 1;
+	out->Ysize -= mask->ysize - 1;
+	if( out->Xsize <= 0 || out->Ysize <= 0 ) {
+		im_error( "morph", "%s", _( "image too small for mask" ) );
+		return( -1 );
+	}
+
+	if( morph->n_pass ) {
+		generate = morph_vector_gen;
+
+#ifdef DEBUG
+		printf( "morph_vector_gen: %d passes\n", morph->n_pass );
+#endif /*DEBUG*/
+	}
+	else if( op == DILATE )
+		generate = dilate_gen;
+	else
+		generate = erode_gen;
+
+	/* Set demand hints. FATSTRIP is good for us, as THINSTRIP will cause
+	 * too many recalculations on overlaps.
+	 */
+	if( im_demand_hint( out, IM_FATSTRIP, in, NULL ) ||
+		im_generate( out, 
+			morph_start, generate, morph_stop, in, morph ) )
+		return( -1 );
+
+	out->Xoffset = -mask->xsize / 2;
+	out->Yoffset = -mask->ysize / 2;
+
+	return( 0 );
+}
+/* Dilate with no padding: out is in shrunk by the mask size less one. */
+int
+im_dilate_raw( IMAGE *in, IMAGE *out, INTMASK *mask )
+{
+	return( morphology( in, out, mask, DILATE ) );
+}
+/* Erode with no padding: out is in shrunk by the mask size less one. */
+int
+im_erode_raw( IMAGE *in, IMAGE *out, INTMASK *mask )
+{
+	return( morphology( in, out, mask, ERODE ) );
+}
+
+/**
+ * im_dilate:
+ * @in: input image, uncoded uchar
+ * @out: output image, the same size as @in
+ * @mask: mask of 0 (clear), 128 (don't care) and 255 (set) elements
+ *
+ * Dilate @in with @mask; edges are padded so @out matches @in in size.
+ * Uses the vector unit if possible: see --vips-novector or IM_NOVECTOR.
+ *
+ * See also: im_erode().
+ *
+ * Returns: 0 on success, -1 on error
+ */
+int 
+im_dilate( IMAGE *in, IMAGE *out, INTMASK *mask )
+{
+	IMAGE *t1 = im_open_local( out, "im_dilate:1", "p" );
+	/* Morph the padded copy t1, not in, so that out keeps in's size. */
+	if( !t1 || 
+		im_embed( in, t1, 1, mask->xsize / 2, mask->ysize / 2, 
+			in->Xsize + mask->xsize - 1, 
+			in->Ysize + mask->ysize - 1 ) ||
+		morphology( t1, out, mask, DILATE ) )
+		return( -1 );
+
+	out->Xoffset = 0;
+	out->Yoffset = 0;
+
+	return( 0 );
+}
+
+/**
+ * im_erode:
+ * @in: input image, uncoded uchar
+ * @out: output image, the same size as @in
+ * @mask: mask of 0 (clear), 128 (don't care) and 255 (set) elements
+ *
+ * Erode @in with @mask; edges are padded so @out matches @in in size.
+ * Uses the vector unit if possible: see --vips-novector or IM_NOVECTOR.
+ *
+ * See also: im_dilate().
+ *
+ * Returns: 0 on success, -1 on error
+ */
+int 
+im_erode( IMAGE *in, IMAGE *out, INTMASK *mask )
+{
+	IMAGE *t1 = im_open_local( out, "im_erode:1", "p" );
+	/* Morph the padded copy t1, not in, so that out keeps in's size. */
+	if( !t1 || 
+		im_embed( in, t1, 1, mask->xsize / 2, mask->ysize / 2, 
+			in->Xsize + mask->xsize - 1, 
+			in->Ysize + mask->ysize - 1 ) ||
+		morphology( t1, out, mask, ERODE ) )
+		return( -1 );
+
+	out->Xoffset = 0;
+	out->Yoffset = 0;
+
+	return( 0 );
+}
diff --git a/libvips/mosaicing/im_improve.c b/libvips/mosaicing/im_improve.c
index 3b1ca9a7..a9907e80 100644
--- a/libvips/mosaicing/im_improve.c
+++ b/libvips/mosaicing/im_improve.c
@@ -156,8 +156,6 @@ copydevpoints( TIE_POINTS *pnew, TIE_POINTS *pold )
 	return( 0 );
 }
 
-#define SWAP( A, B ) { void *t = (A); A = B; B = t; }
-
 int 
 im__improve( TIE_POINTS *inpoints, TIE_POINTS *outpoints )
 {
@@ -182,7 +180,7 @@ im__improve( TIE_POINTS *inpoints, TIE_POINTS *outpoints )
 
 		/* And loop.
 		 */
-		SWAP( p, q );
+		IM_SWAP( void *, p, q );
 	}
 
 	/* q has the output - copy to outpoints.