From a5bef08d4a7902d0e929571c29630a7d1cb94e8f Mon Sep 17 00:00:00 2001
From: John Cupitt <jcupitt@gmail.com>
Date: Wed, 15 Jun 2016 13:56:19 +0100
Subject: [PATCH] better reducev multiplication

more accurate, no slower

add more tests too
---
 ChangeLog                    |  1 +
 libvips/resample/reducev.cpp | 71 ++++++++++++++++++++++--------------
 test/test_resample.py        | 10 +++++
 3 files changed, 54 insertions(+), 28 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 84451ffa..605714b1 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -23,6 +23,7 @@
 - VImage::write() return value changed from void to VImage to help chaining
 - added C++ arithmetic assignment overloads, += etc. 
 - VImage::ifthenelse() with double args was missing =0 on options
+- better accuracy for reducev with smarter multiplication
 
 18/5/16 started 8.3.2
 - more robust vips image reading
diff --git a/libvips/resample/reducev.cpp b/libvips/resample/reducev.cpp
index 49416b7d..7a16c77e 100644
--- a/libvips/resample/reducev.cpp
+++ b/libvips/resample/reducev.cpp
@@ -9,6 +9,8 @@
  * 2/4/16
  * 	- better int mask creation ... we now adjust the scale to keep the sum
  * 	  equal to the target scale
+ * 15/6/16
+ * 	- better accuracy with smarter multiplication
  */
 
 /*
@@ -178,25 +180,23 @@ vips_reducev_compile_section( VipsReducev *reducev, Pass *pass, gboolean first )
 
 	/* The value we fetch from the image, the accumulated sum.
 	 */
-	TEMP( "sum", 2 );
 	TEMP( "value", 2 );
-	TEMP( "valueb", 1 );
+	TEMP( "sum", 2 );
 
 	/* Init the sum. If this is the first pass, it's a constant. If this
 	 * is a later pass, we have to init the sum from the result 
 	 * of the previous pass. 
 	 */
 	if( first ) {
-		char zero[256];
+		char c0[256];
 
-		CONST( zero, 0, 2 );
-		ASM2( "loadpw", "sum", zero );
+		CONST( c0, 0, 2 );
+		ASM2( "loadpw", "sum", c0 );
 	}
 	else 
 		ASM2( "loadw", "sum", "r" );
 
 	for( i = pass->first; i < reducev->n_point; i++ ) {
-		char one[256];
 		char source[256];
 		char coeff[256];
 
@@ -205,21 +205,24 @@ vips_reducev_compile_section( VipsReducev *reducev, Pass *pass, gboolean first )
 		/* This mask coefficient.
 		 */
 		vips_snprintf( coeff, 256, "p%d", i );
-		pass->p[pass->n_param] = PARAM( coeff, 1 );
+		pass->p[pass->n_param] = PARAM( coeff, 2 );
 		pass->n_param += 1;
 		if( pass->n_param >= MAX_PARAM )
 			return( -1 );
 
 		/* Mask coefficients are 2.6 bits fixed point. We need to hold
 		 * about -0.5 to 1.0, so -2 to +1.999 is as close as we can
-		 * get. The image pixel must be signed too, so we shift right
-		 * to clear the top bit.
+		 * get. 
+		 *
+		 * We need a signed multiply, so the image pixel needs to
+		 * become a signed 16-bit value. We know only the bottom 8 bits
+		 * of the image and coefficient are interesting, so we can take
+		 * the bottom bits of a 16x16->32 multiply. 
 		 *
 		 * We accumulate the signed 16-bit result in sum.
 		 */
-		CONST( one, 1, 1 );
-		ASM3( "shrub", "valueb", source, one );
-		ASM3( "mulsbw", "value", "valueb", coeff );
+		ASM2( "convubw", "value", source );
+		ASM3( "mullw", "value", "value", coeff );
 		ASM3( "addssw", "sum", "sum", "value" );
 
 		/* We've used this coeff.
@@ -240,28 +243,24 @@ vips_reducev_compile_section( VipsReducev *reducev, Pass *pass, gboolean first )
 	 * image, otherwise write the 16-bit intermediate to our temp buffer. 
 	 */
 	if( pass->last >= reducev->n_point - 1 ) {
-		char thirtytwo[256];
-		char five[256];
-		char zero[256];
-		char twofivefive[256];
+		char c32[256];
+		char c6[256];
+		char c0[256];
+		char c255[256];
 
-		/* You'd think we might add 16 before dividing by 32, but we
-		 * are really dividing by 64, since we right-shifted the pixel
-		 * values on load above. So we add 32.
-		 */
-		CONST( thirtytwo, 32, 2 );
-		ASM3( "addw", "sum", "sum", thirtytwo );
-		CONST( five, 5, 2 );
-		ASM3( "shrsw", "sum", "sum", five );
+		CONST( c32, 32, 2 );
+		ASM3( "addw", "sum", "sum", c32 );
+		CONST( c6, 6, 2 );
+		ASM3( "shrsw", "sum", "sum", c6 );
 
 		/* You'd think "convsuswb", convert signed 16-bit to unsigned
 		 * 8-bit with saturation, would be quicker, but it's a lot
 		 * slower.
 		 */
-		CONST( zero, 0, 2 );
-		ASM3( "maxsw", "sum", zero, "sum" ); 
-		CONST( twofivefive, 255, 2 );
-		ASM3( "minsw", "sum", twofivefive, "sum" ); 
+		CONST( c0, 0, 2 );
+		ASM3( "maxsw", "sum", c0, "sum" ); 
+		CONST( c255, 255, 2 );
+		ASM3( "minsw", "sum", c255, "sum" ); 
 
 		ASM2( "convwb", "d1", "sum" );
 	}
@@ -657,6 +656,17 @@ vips_reducev_vector_gen( VipsRegion *out_region, void *vseq,
 		const int ty = (siy + 1) >> 1;
 		const int *cyo = reducev->matrixo[ty];
 
+#ifdef DEBUG
+		printf( "starting row %d\n", y + r->top ); 
+		printf( "coefficients:\n" );
+		for( int i = 0; i < reducev->n_point; i++ ) 
+			printf( "\t%d - %d\n", i, cyo[i] );
+		printf( "first column of pixel values:\n" ); 
+		for( int i = 0; i < reducev->n_point; i++ ) 
+			printf( "\t%d - %d\n", i, 
+				*VIPS_REGION_ADDR( ir, r->left, r->top + y + i ) ); 
+#endif /*DEBUG*/
+
 		/* We run our n passes to generate this scanline.
 		 */
 		for( int i = 0; i < reducev->n_pass; i++ ) {
@@ -676,6 +686,11 @@ vips_reducev_vector_gen( VipsRegion *out_region, void *vseq,
 
 			VIPS_SWAP( signed short *, seq->t1, seq->t2 );
 		}
+
+#ifdef DEBUG
+		printf( "pixel result:\n" );
+		printf( "\t%d\n", *q ); 
+#endif /*DEBUG*/
 	}
 
 	VIPS_GATE_STOP( "vips_reducev_vector_gen: work" ); 
diff --git a/test/test_resample.py b/test/test_resample.py
index 903a7aaa..2819879e 100755
--- a/test/test_resample.py
+++ b/test/test_resample.py
@@ -154,6 +154,16 @@ class TestResample(unittest.TestCase):
                     d = abs(r.avg() - im.avg())
                     self.assertLess(d, 2)
 
+        # try constant images ... should not change the constant
+        for const in [0, 1, 2, 254, 255]:
+            im = (Vips.Image.black(10, 10) + const).cast("uchar")
+            for kernel in ["nearest", "linear", "cubic", "lanczos2", "lanczos3"]:
+                # print "testing kernel =", kernel
+                # print "testing const =", const
+                shr = im.reduce(2, 2, kernel = kernel)
+                d = abs(shr.avg() - im.avg())
+                self.assertEqual(d, 0)
+
     def test_resize(self):
         im = Vips.Image.new_from_file("images/IMG_4618.jpg")
         im2 = im.resize(0.25)