diff --git a/ChangeLog b/ChangeLog
index 2188a233..e7857aea 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -27,10 +27,10 @@
 - better quality for vips_resize() with linear/cubic kernels 
 - pyvips8 can create new metadata
 - better upsizing with vips_resize()
-- added vips_convf(), vips_convi(), vips_convasep() ... im_conv*() functions 
-  rewritten as classes
+- added vips_convf(), vips_convi(), vips_convasep(), vips_conva() ... 
+  im_conv*() functions rewritten as classes
 - vips_convsep() calls vips_convasep() for the approximate case
-- new fixed-point vector path for convi is up to 2x faster
+- new fixed-point vector path for convi is up to about 2x faster
 - added vips_worley(), generate Worley noise
 - added vips_perlin(), generate Perlin noise
 - gif loader can write 1, 2, 3, or 4 bands depending on file contents
diff --git a/TODO b/TODO
index d7c52a41..2743790b 100644
--- a/TODO
+++ b/TODO
@@ -1,5 +1,7 @@
 - tests for convasep ... just check it matches convsep +/- some small amount
 
+- tests for conva
+
 - try this blur.mat
 
 	6 1 3896 
diff --git a/libvips/convolution/Makefile.am b/libvips/convolution/Makefile.am
index dab3e2d8..b2abd237 100644
--- a/libvips/convolution/Makefile.am
+++ b/libvips/convolution/Makefile.am
@@ -6,6 +6,7 @@ libconvolution_la_SOURCES = \
 	correlation.c \
 	correlation.h \
 	conv.c \
+	conva.c \
 	convf.c \
 	convi.c \
 	convasep.c \
diff --git a/libvips/convolution/conva.c b/libvips/convolution/conva.c
index 3f1c36c6..5ab55e45 100644
--- a/libvips/convolution/conva.c
+++ b/libvips/convolution/conva.c
@@ -72,12 +72,16 @@ $ vips im_max abs.v
 
   	- are we handling mask offset correctly?
 
+	- could we do better with an h and a v cumulativization image? we might 
+	not need the huge intermediate we have now, since any line sum an be 
+	found with simple indexing
+
  */
 
 /*
- */
 #define DEBUG
 #define VIPS_DEBUG
+ */
 
 #ifdef HAVE_CONFIG_H
 #include <config.h>
@@ -93,10 +97,14 @@ $ vips im_max abs.v
 #include <vips/vips.h>
 #include <vips/vector.h>
 #include <vips/debug.h>
+#include <vips/internal.h>
 
-/* Maximum number of boxes we can break the mask into.
+#include "pconvolution.h"
+
+/* Maximum number of boxes we can break the mask into. Don't have this too
+ * high, it'll make the instance huge, and gobject has a 64kb limit. 
  */
-#define MAX_LINES (10000)
+#define MAX_LINES (1000)
 
 /* The number of edges we consider at once in clustering. Higher values are
  * faster, but risk pushing up average error in the result.
@@ -167,8 +175,6 @@ typedef struct {
 	int area;
 	int rounding;
 	int offset;
-	int width;
-	int height;
 
 	/* The horizontal lines we gather. hline[3] writes to band 3 in the
 	 * intermediate image. max_line is the length of the longest hline:
@@ -259,7 +265,7 @@ vips_conva_hprint( VipsConva *conva )
 			conva->velement[y].factor,
 			conva->hline[b].weight );
 		for( x = 0; x < 45; x++ ) {
-			int rx = x * (conva->width + 1) / 45;
+			int rx = x * (conva->iM->Xsize + 1) / 45;
 
 			if( rx >= conva->hline[b].start && 
 				rx < conva->hline[b].end )
@@ -300,6 +306,7 @@ vips_conva_decompose_lines( VipsConva *conva )
 {
 	VipsImage *iM = conva->iM;
 	const int size = iM->Xsize * iM->Ysize;
+	double *coeff = VIPS_MATRIX( iM, 0, 0 ); 
 
 	double max;
 	double min;
@@ -355,14 +362,14 @@ vips_conva_decompose_lines( VipsConva *conva )
 			inside = 0;
 
 			for( x = 0; x < iM->Xsize; x++ ) {
-				double coeff = *VIPS_MATRIX( iM, x, y );
+				double c = coeff[x + y * iM->Xsize];
 
 				/* The vertical line from mask[x, y] to 0 is 
 				 * inside. Is our current square (x, y) part 
 				 * of that line?
 				 */
-				if( (z_positive && coeff >= z_ph) ||
-					(!z_positive && coeff <= z_ph) ) {
+				if( (z_positive && c >= z_ph) ||
+					(!z_positive && c <= z_ph) ) {
 					if( !inside ) {
 						vips_conva_hline_start( conva,
 							x );
@@ -382,7 +389,7 @@ vips_conva_decompose_lines( VipsConva *conva )
 
 			if( inside && 
 				vips_conva_hline_end( conva, 
-					mask->xsize, y, z_positive ? 1 : -1 ) )
+					iM->Xsize, y, z_positive ? 1 : -1 ) )
 				return( -1 );
 		}
 	}
@@ -480,7 +487,7 @@ vips_conva_cluster2( VipsConva *conva )
 			if( conva->hline[j].weight == 0 )
 				continue;
 
-			distance = vips_conva_distance( boxes, i, j ); 
+			distance = vips_conva_distance( conva, i, j ); 
 			if( distance < worst ) {
 				conva->edge[worst_i].a = i;
 				conva->edge[worst_i].b = j;
@@ -672,7 +679,7 @@ vips_conva_decompose_boxes( VipsConva *conva )
 {
 	VipsObjectClass *class = VIPS_OBJECT_GET_CLASS( conva );
 	VipsImage *iM = conva->iM;
-	double *coeff = VIPS_MATRIX( im, 0, 0 ); 
+	double *coeff = VIPS_MATRIX( iM, 0, 0 ); 
 	const int size = iM->Xsize * iM->Ysize;
 	double scale = vips_image_get_scale( iM ); 
 	double offset = vips_image_get_offset( iM ); 
@@ -680,18 +687,15 @@ vips_conva_decompose_boxes( VipsConva *conva )
 	double sum;
 	int x, y, z;
 
-	boxes->n_hline = 0;
-	boxes->n_velement = 0;
-	boxes->n_vline = 0;
-
 	/* Break into a set of hlines.
 	 */
-	if( vips_conva_decompose_lines( boxes ) )
+	if( vips_conva_decompose_lines( conva ) )
 		return( -1 );
 
 	/* Cluster to find groups of lines.
 	 */
-	VIPS_DEBUG_MSG( "boxes_new: clustering with thresh %d ...\n", cluster );
+	VIPS_DEBUG_MSG( "vips_conva_decompose_boxes: "
+		"clustering with thresh %d ...\n", conva->cluster );
 	while( vips_conva_cluster2( conva ) )
 		;
 
@@ -734,7 +738,7 @@ vips_conva_decompose_boxes( VipsConva *conva )
 		sum += coeff[z];
 
 	conva->area = VIPS_RINT( sum * conva->area / scale );
-	conva->rounding = (conva->area + 1) / 2 + offset * conva->area;
+	conva->rounding = (conva->area + 1) / 2;
 	conva->offset = offset;
 
 #ifdef DEBUG
@@ -795,9 +799,7 @@ vips_conva_start( VipsImage *out, void *a, void *b )
 
 	VipsConvaSeq *seq;
 
-	if( !(seq = VIPS_NEW( out, VipsConvaSeq )) )
-		return( NULL );
-
+	seq = VIPS_NEW( out, VipsConvaSeq );
 	seq->conva = conva;
 	seq->ir = vips_region_new( in );
 
@@ -815,11 +817,6 @@ vips_conva_start( VipsImage *out, void *a, void *b )
 		seq->sum = VIPS_ARRAY( out, conva->n_velement, double );
 	seq->last_stride = -1;
 
-	if( !seq->ir || !seq->start || !seq->end || !seq->sum ) {
-		vips_conva_stop( seq, in, boxes );
-		return( NULL );
-	}
-
 	return( seq );
 }
 
@@ -863,7 +860,8 @@ G_STMT_START { \
 /* Do horizontal masks ... we scan the mask along scanlines.
  */
 static int
-vips_conva_hgenerate( VipsRegion *or, void *vseq, void *a, void *b )
+vips_conva_hgenerate( VipsRegion *or, void *vseq, 
+	void *a, void *b, gboolean *stop )
 {
 	VipsConvaSeq *seq = (VipsConvaSeq *) vseq;
 	VipsImage *in = (VipsImage *) a;
@@ -888,7 +886,7 @@ vips_conva_hgenerate( VipsRegion *or, void *vseq, void *a, void *b )
 	 * than the section of the output image we are producing.
 	 */
 	s = *r;
-	s.width += mask->xsize - 1;
+	s.width += iM->Xsize - 1;
 	if( vips_region_prepare( ir, &s ) )
 		return( -1 );
 
@@ -967,7 +965,7 @@ vips_conva_hgenerate( VipsRegion *or, void *vseq, void *a, void *b )
 static int
 vips_conva_horizontal( VipsConva *conva, VipsImage *in, VipsImage **out )
 {
-	VipsObjectClass *class = VIPS_OBJECT_GET_CLASS( convasep );
+	VipsObjectClass *class = VIPS_OBJECT_GET_CLASS( conva );
 
 	/* Prepare output. Consider a 7x7 mask and a 7x7 image --- the output
 	 * would be 1x1.
@@ -1083,11 +1081,13 @@ G_STMT_START { \
 /* Do vertical masks ... we scan the mask down columns of pixels. 
  */
 static int
-vips_conva_vgenerate( VipsRegion *or, void *vseq, void *a, void *b )
+vips_conva_vgenerate( VipsRegion *or, void *vseq, 
+	void *a, void *b, gboolean *stop )
 {
 	VipsConvaSeq *seq = (VipsConvaSeq *) vseq;
 	VipsImage *in = (VipsImage *) a;
 	VipsConva *conva = (VipsConva *) b;
+	VipsConvolution *convolution = (VipsConvolution *) conva;
 
 	VipsRegion *ir = seq->ir;
 	const int n_vline = conva->n_vline;
@@ -1108,14 +1108,14 @@ vips_conva_vgenerate( VipsRegion *or, void *vseq, void *a, void *b )
 	 * than the section of the output image we are producing.
 	 */
 	s = *r;
-	s.height += mask->ysize - 1;
+	s.height += iM->Ysize - 1;
 	if( vips_region_prepare( ir, &s ) )
 		return( -1 );
 
 	istride = VIPS_REGION_LSKIP( ir ) / 
 		VIPS_IMAGE_SIZEOF_ELEMENT( in );
 	ostride = VIPS_REGION_LSKIP( or ) / 
-		VIPS_IMAGE_SIZEOF_ELEMENT( boxes->out );
+		VIPS_IMAGE_SIZEOF_ELEMENT( convolution->out );
 
         /* Init offset array. 
 	 */
@@ -1130,7 +1130,7 @@ vips_conva_vgenerate( VipsRegion *or, void *vseq, void *a, void *b )
 		}
 	}
 
-	switch( conva->in->BandFmt ) {
+	switch( convolution->in->BandFmt ) {
 	case VIPS_FORMAT_UCHAR: 	
 		if( conva->max_line > 256 )
 			VCONV( unsigned int, \
@@ -1143,10 +1143,10 @@ vips_conva_vgenerate( VipsRegion *or, void *vseq, void *a, void *b )
 	case VIPS_FORMAT_CHAR: 	
 		if( conva->max_line > 256 )
 			VCONV( signed int, \
-				signed int, signed char, CLIP_UCHAR );
+				signed int, signed char, CLIP_CHAR );
 		else
 			VCONV( signed int, \
-				signed short, signed char, CLIP_UCHAR );
+				signed short, signed char, CLIP_CHAR );
 		break;
 
 	case VIPS_FORMAT_USHORT: 	
@@ -1192,7 +1192,8 @@ vips_conva_vgenerate( VipsRegion *or, void *vseq, void *a, void *b )
 static int
 vips_conva_vertical( VipsConva *conva, VipsImage *in, VipsImage **out )
 {
-	VipsObjectClass *class = VIPS_OBJECT_GET_CLASS( convasep );
+	VipsObjectClass *class = VIPS_OBJECT_GET_CLASS( conva );
+	VipsConvolution *convolution = (VipsConvolution *) conva;
 
 	/* Prepare output. Consider a 7x7 mask and a 7x7 image --- the output
 	 * would be 1x1.
@@ -1208,12 +1209,12 @@ vips_conva_vertical( VipsConva *conva, VipsImage *in, VipsImage **out )
 			"%s", _( "image too small for mask" ) );
 		return( -1 );
 	}
-	out->Bands = boxes->in->Bands;
-	out->BandFmt = boxes->in->BandFmt;
+	(*out)->Bands = convolution->in->Bands;
+	(*out)->BandFmt = convolution->in->BandFmt;
 
-	if( vips_image_generate( out, 
+	if( vips_image_generate( *out, 
 		vips_conva_start, vips_conva_vgenerate, vips_conva_stop, 
-		in, boxes ) )
+		in, conva ) )
 		return( -1 );
 
 	return( 0 );
@@ -1222,7 +1223,6 @@ vips_conva_vertical( VipsConva *conva, VipsImage *in, VipsImage **out )
 static int
 vips_conva_build( VipsObject *object )
 {
-	VipsObjectClass *class = VIPS_OBJECT_GET_CLASS( object );
 	VipsConvolution *convolution = (VipsConvolution *) object;
 	VipsConva *conva = (VipsConva *) object;
 	VipsImage **t = (VipsImage **) vips_object_local_array( object, 4 );
@@ -1237,8 +1237,6 @@ vips_conva_build( VipsObject *object )
 	if( vips__image_intize( convolution->M, &t[0] ) )
 		return( -1 );
 	conva->iM = t[0]; 
-	conva->width = conva->iM->Xsize;
-	conva->height = conva->iM->Ysize;
 
 	in = convolution->in;
 
@@ -1248,14 +1246,14 @@ vips_conva_build( VipsObject *object )
 	g_object_set( conva, "out", vips_image_new(), NULL ); 
 	if( 
 		vips_embed( in, &t[1], 
-			conva->width / 2, 
-			conva->height / 2, 
-			in->Xsize + convasep->width - 1, 
-			in->Ysize + convasep->height - 1,
+			t[0]->Xsize / 2, 
+			t[0]->Ysize / 2, 
+			in->Xsize + t[0]->Xsize - 1, 
+			in->Ysize + t[0]->Ysize - 1,
 			"extend", VIPS_EXTEND_COPY,
 			NULL ) ||
-		vips_conva_horizontal( convasep, t[1], &t[2] ) ||
-		vips_conva_vertical( convasep, t[2], &t[3] ) ||
+		vips_conva_horizontal( conva, t[1], &t[2] ) ||
+		vips_conva_vertical( conva, t[2], &t[3] ) ||
 		vips_image_write( t[3], convolution->out ) )
 		return( -1 );
 
@@ -1275,7 +1273,7 @@ vips_conva_class_init( VipsConvaClass *class )
 	gobject_class->get_property = vips_object_get_property;
 
 	object_class->nickname = "conva";
-	object_class->description = _( "approximate convolution" );
+	object_class->description = _( "approximate integer convolution" );
 	object_class->build = vips_conva_build;
 
 	VIPS_ARG_INT( class, "layers", 104, 
@@ -1289,7 +1287,7 @@ vips_conva_class_init( VipsConvaClass *class )
 		_( "Cluster" ), 
 		_( "Cluster lines closer than this in approximation" ),
 		VIPS_ARGUMENT_OPTIONAL_INPUT, 
-		G_STRUCT_OFFSET( VipsConv, cluster ), 
+		G_STRUCT_OFFSET( VipsConva, cluster ), 
 		1, 100, 1 ); 
 
 }
@@ -1313,12 +1311,14 @@ vips_conva_init( VipsConva *conva )
  * * @layers: %gint, number of layers for approximation
  * * @cluster: %gint, cluster lines closer than this distance
  *
- * Perform an approximate convolution of @in with @mask.
+ * Perform an approximate integer convolution of @in with @mask.
  * This is a low-level operation, see 
  * vips_conv() for something more convenient. 
  *
  * The output image 
  * always has the same #VipsBandFormat as the input image. 
+ * Elements of @mask are converted to
+ * integers before convolution.
  *
  * Larger values for @layers give more accurate
  * results, but are slower. As @layers approaches the mask radius, the
diff --git a/libvips/convolution/convasep.c b/libvips/convolution/convasep.c
index 92fcf11c..c49a8ee2 100644
--- a/libvips/convolution/convasep.c
+++ b/libvips/convolution/convasep.c
@@ -61,12 +61,16 @@
 	  we could then use orc to write a bit of code to implement this set 
 	  of lines
 
+	  stackoverflow has an algorithm for cumulativization using SIMD and 
+	  threads, see that font rasterization with rust piece on medium by 
+	  ralph levien
+
  */
 
 /*
- */
 #define DEBUG
 #define VIPS_DEBUG
+ */
 
 #ifdef HAVE_CONFIG_H
 #include <config.h>
@@ -883,7 +887,8 @@ vips_convasep_class_init( VipsConvasepClass *class )
 	gobject_class->get_property = vips_object_get_property;
 
 	object_class->nickname = "convasep";
-	object_class->description = _( "approximate separable convolution" );
+	object_class->description = 
+		_( "approximate separable integer convolution" );
 	object_class->build = vips_convasep_build;
 
 	VIPS_ARG_INT( class, "layers", 104, 
@@ -913,7 +918,7 @@ vips_convasep_init( VipsConvasep *convasep )
  *
  * * @layers: %gint, number of layers for approximation
  *
- * Approximate separable convolution. This is a low-level operation, see 
+ * Approximate separable integer convolution. This is a low-level operation, see 
  * vips_convsep() for something more convenient. 
  *
  * The image is convolved twice: once with @mask and then again with @mask 
diff --git a/libvips/convolution/convolution.c b/libvips/convolution/convolution.c
index b699f863..2650a40c 100644
--- a/libvips/convolution/convolution.c
+++ b/libvips/convolution/convolution.c
@@ -157,6 +157,7 @@ void
 vips_convolution_operation_init( void )
 {
 	extern int vips_conv_get_type( void ); 
+	extern int vips_conva_get_type( void ); 
 	extern int vips_convf_get_type( void ); 
 	extern int vips_convi_get_type( void ); 
 	extern int vips_convsep_get_type( void ); 
@@ -168,6 +169,7 @@ vips_convolution_operation_init( void )
 	extern int vips_gaussblur_get_type( void ); 
 
 	vips_conv_get_type(); 
+	vips_conva_get_type(); 
 	vips_convf_get_type(); 
 	vips_convi_get_type(); 
 	vips_compass_get_type(); 
diff --git a/libvips/include/vips/convolution.h b/libvips/include/vips/convolution.h
index 91f096d9..a79df95b 100644
--- a/libvips/include/vips/convolution.h
+++ b/libvips/include/vips/convolution.h
@@ -44,15 +44,17 @@ typedef enum {
 	VIPS_COMBINE_LAST
 } VipsCombine;
 
+int vips_conv( VipsImage *in, VipsImage **out, VipsImage *mask, ... )
+	__attribute__((sentinel));
 int vips_convf( VipsImage *in, VipsImage **out, VipsImage *mask, ... )
 	__attribute__((sentinel));
 int vips_convi( VipsImage *in, VipsImage **out, VipsImage *mask, ... )
 	__attribute__((sentinel));
-int vips_convasep( VipsImage *in, VipsImage **out, VipsImage *mask, ... )
+int vips_conva( VipsImage *in, VipsImage **out, VipsImage *mask, ... )
 	__attribute__((sentinel));
 int vips_convsep( VipsImage *in, VipsImage **out, VipsImage *mask, ... )
 	__attribute__((sentinel));
-int vips_conv( VipsImage *in, VipsImage **out, VipsImage *mask, ... )
+int vips_convasep( VipsImage *in, VipsImage **out, VipsImage *mask, ... )
 	__attribute__((sentinel));
 
 int vips_compass( VipsImage *in, VipsImage **out, VipsImage *mask, ... )
diff --git a/libvips/iofuncs/memory.c b/libvips/iofuncs/memory.c
index 21a11741..ca5e2cca 100644
--- a/libvips/iofuncs/memory.c
+++ b/libvips/iofuncs/memory.c
@@ -109,7 +109,15 @@ static GMutex *vips_tracked_mutex = NULL;
  * @OBJ: allocate memory local to @OBJ, or %NULL for no auto-free
  * @T: type of thing to allocate
  *
- * Returns: A pointer of type @T *, or %NULL on error.
+ * Allocate memory for a thing of type @T. The memory is not
+ * cleared. 
+ * 
+ * This macro cannot fail. See vips_tracked_malloc() if you are 
+ * allocating large amounts of memory.
+ *
+ * See also: vips_malloc().
+ *
+ * Returns: A pointer of type @T *.
  */
 
 /**
@@ -118,7 +126,15 @@ static GMutex *vips_tracked_mutex = NULL;
  * @N: number of @T 's to allocate
  * @T: type of thing to allocate
  *
- * Returns: A pointer of type @T *, or %NULL on error.
+ * Allocate memory for an array of objects of type @T. The memory is not
+ * cleared. 
+ *
+ * This macro cannot fail. See vips_tracked_malloc() if you are 
+ * allocating large amounts of memory.
+ *
+ * See also: vips_malloc().
+ *
+ * Returns: A pointer of type @T *.
  */
 
 static void
@@ -141,7 +157,7 @@ vips_malloc_cb( VipsObject *object, char *buf )
  *
  * See also: vips_tracked_malloc().
  *
- * Returns: (transfer full): a pointer to the allocated memory
+ * Returns: (transfer full): a pointer to the allocated memory.
  */
 void *
 vips_malloc( VipsObject *object, size_t size )
@@ -166,7 +182,7 @@ vips_malloc( VipsObject *object, size_t size )
  *
  * g_strdup() a string. When @object is freed, the string will be freed for
  * you.  If @object is %NULL, you need to 
- * free the memory explicitly with g_free().
+ * free the memory yourself with g_free().
  *
  * This function cannot fail. 
  *