diff --git a/TODO b/TODO
index 09ba1f55..d7c52a41 100644
--- a/TODO
+++ b/TODO
@@ -1,11 +1,18 @@
-- convasep does not handle offset correctly
+- tests for convasep ... just check it matches convsep +/- some small amount
+
+- try this blur.mat
+
+	6 1 3896 
+	20 22 24 27 30 32
+ 
+  ie. missing offset, though scale is there ... is not recognised
 
 - test new clip stuff?
 
 
 
 
-
+- worley could output float distance by default?
 
 - add more webp tests to py suite
 
diff --git a/libvips/convolution/conv.c b/libvips/convolution/conv.c
index a4326edc..1f247b6a 100644
--- a/libvips/convolution/conv.c
+++ b/libvips/convolution/conv.c
@@ -175,9 +175,9 @@ vips_conv_init( VipsConv *conv )
  *
  * Optional arguments:
  *
- * * @precision: calculation accuracy
- * * @layers: number of layers for approximation
- * * @cluster: cluster lines closer than this distance
+ * * @precision: #VipsPrecision, calculation accuracy
+ * * @layers: %gint, number of layers for approximation
+ * * @cluster: %gint, cluster lines closer than this distance
  *
  * Convolution. 
  *
@@ -190,8 +190,10 @@ vips_conv_init( VipsConv *conv )
  *
  * where scale and offset are part of @mask. 
  *
- * If @precision is #VIPS_PRECISION_INTEGER then the convolution is performed
- * with integer arithmetic and the output image 
+ * If @precision is #VIPS_PRECISION_INTEGER, then 
+ * elements of @mask are converted to
+ * integers before convolution, using rint(),
+ * and the output image 
  * always has the same #VipsBandFormat as the input image. 
  *
  * For #VIPS_FORMAT_UCHAR images, vips_conv() uses a fast vector path based on
@@ -204,7 +206,9 @@ vips_conv_init( VipsConv *conv )
  * is always #VIPS_FORMAT_FLOAT unless @in is #VIPS_FORMAT_DOUBLE, in which case
  * @out is also #VIPS_FORMAT_DOUBLE. 
  *
- * If @precision is #VIPS_PRECISION_APPROXIMATE then the output image 
+ * If @precision is #VIPS_PRECISION_APPROXIMATE then, like
+ * #VIPS_PRECISION_INTEGER, @mask is converted to int before convolution, and 
+ * the output image 
  * always has the same #VipsBandFormat as the input image. 
  *
  * Larger values for @layers give more accurate
@@ -216,6 +220,8 @@ vips_conv_init( VipsConv *conv )
  * Smaller values of @cluster will give more accurate results, but be slower
  * and use more memory. 10% of the mask radius is a good rule of thumb.
  *
+ * See also: vips_convsep().
+ *
  * Returns: 0 on success, -1 on error
  */
 int 
diff --git a/libvips/convolution/convasep.c b/libvips/convolution/convasep.c
index a53bb788..92fcf11c 100644
--- a/libvips/convolution/convasep.c
+++ b/libvips/convolution/convasep.c
@@ -53,11 +53,6 @@
 
   TODO
 
-  	- are we handling mask offset correctly?
-
-	  nope! we have area and rounding, but neither properly incorporates 
-	  offset
-
 	- how about making a cumulative image and then subtracting points in 
 	  that, rather than keeping a set of running totals
 
@@ -68,14 +63,10 @@
 
  */
 
-/* Show sample pixels as they are transformed.
-#define DEBUG_PIXELS
- */
-
 /*
+ */
 #define DEBUG
 #define VIPS_DEBUG
- */
 
 #ifdef HAVE_CONFIG_H
 #include <config.h>
@@ -306,7 +297,7 @@ vips_convasep_decompose( VipsConvasep *convasep )
 	for( z = 0; z < convasep->width; z++ ) 
 		sum += coeff[z];
 
-	convasep->area = rint( sum * convasep->area / scale );
+	convasep->area = VIPS_RINT( sum * convasep->area / scale );
 	convasep->rounding = (convasep->area + 1) / 2;
 	convasep->offset = offset;
 
@@ -463,8 +454,11 @@ G_STMT_START { \
 				isum[z] += p[x]; \
 			sum += convasep->factor[z] * isum[z]; \
 		} \
-		sum = (sum + convasep->rounding) / convasep->area + \
-			convasep->offset; \
+		\
+		/* Don't add offset ... we only want to do that once, do it on \
+		 * the vertical pass. \
+		 */ \
+		sum = (sum + convasep->rounding) / convasep->area; \
 		CLIP( sum ); \
 		*q = sum; \
 		q += ostride; \
@@ -477,8 +471,7 @@ G_STMT_START { \
 				sum += convasep->factor[z] * isum[z]; \
 			} \
 			p += istride; \
-			sum = (sum + convasep->rounding) / convasep->area + \
-				convasep->offset; \
+			sum = (sum + convasep->rounding) / convasep->area; \
 			CLIP( sum ); \
 			*q = sum; \
 			q += ostride; \
@@ -504,7 +497,11 @@ G_STMT_START { \
 				dsum[z] += p[x]; \
 			sum += convasep->factor[z] * dsum[z]; \
 		} \
-		sum = sum / convasep->area + convasep->offset; \
+		\
+		/* Don't add offset ... we only want to do that once, do it on \
+		 * the vertical pass. \
+		 */ \
+		sum = sum / convasep->area; \
 		*q = sum; \
 		q += ostride; \
 		\
@@ -516,7 +513,7 @@ G_STMT_START { \
 				sum += convasep->factor[z] * dsum[z]; \
 			} \
 			p += istride; \
-			sum = sum / convasep->area + convasep->offset; \
+			sum = sum / convasep->area; \
 			*q = sum; \
 			q += ostride; \
 		} \
@@ -830,6 +827,7 @@ vips_convasep_pass( VipsConvasep *convasep,
 static int 
 vips_convasep_build( VipsObject *object )
 {
+	VipsObjectClass *class = VIPS_OBJECT_GET_CLASS( object );
 	VipsConvolution *convolution = (VipsConvolution *) object;
 	VipsConvasep *convasep = (VipsConvasep *) object;
 	VipsImage **t = (VipsImage **) vips_object_local_array( object, 4 );
@@ -839,6 +837,9 @@ vips_convasep_build( VipsObject *object )
 	if( VIPS_OBJECT_CLASS( vips_convasep_parent_class )->build( object ) )
 		return( -1 );
 
+	if( vips_check_separable( class->nickname, convolution->M ) ) 
+                return( -1 );
+
 	/* An int version of our mask.
 	 */
 	if( vips__image_intize( convolution->M, &t[3] ) )
@@ -915,12 +916,11 @@ vips_convasep_init( VipsConvasep *convasep )
  * Approximate separable convolution. This is a low-level operation, see 
  * vips_convsep() for something more convenient. 
  *
- * The mask must be 1xn or nx1 elements. 
- * The output image 
- * always has the same #VipsBandFormat as the input image. 
- *
  * The image is convolved twice: once with @mask and then again with @mask 
  * rotated by 90 degrees. 
+ * @mask must be 1xn or nx1 elements. 
+ * Elements of @mask are converted to
+ * integers before convolution.
  *
  * Larger values for @layers give more accurate
  * results, but are slower. As @layers approaches the mask radius, the
@@ -928,7 +928,10 @@ vips_convasep_init( VipsConvasep *convasep )
  * match. For many large masks, such as Gaussian, @layers need be only 10% of
  * this value and accuracy will still be good.
  *
- * See also: vips_conv().
+ * The output image 
+ * always has the same #VipsBandFormat as the input image. 
+ *
+ * See also: vips_convsep().
  *
  * Returns: 0 on success, -1 on error
  */
diff --git a/libvips/convolution/convsep.c b/libvips/convolution/convsep.c
index 27299c0a..d428e734 100644
--- a/libvips/convolution/convsep.c
+++ b/libvips/convolution/convsep.c
@@ -83,8 +83,14 @@ vips_convsep_build( VipsObject *object )
 		in = t[0];
 	}
 	else { 
-		if( vips_rot( convolution->M, &t[0], VIPS_ANGLE_D90, NULL ) ||
-			vips_conv( convolution->in, &t[1], convolution->M, 
+		if( vips_rot( convolution->M, &t[0], VIPS_ANGLE_D90, NULL ) )
+			return( -1 ); 
+
+		/* We must only add the offset once.
+		 */
+		vips_image_set_double( t[0], "offset", 0 );
+
+		if( vips_conv( convolution->in, &t[1], convolution->M, 
 				"precision", convsep->precision,
 				"layers", convsep->layers,
 				"cluster", convsep->cluster,