From b23a1ece6c610a88f036f866505ea35ce8eb580d Mon Sep 17 00:00:00 2001
From: Nicolas Robidoux <nicolas.robidoux@gmail.com>
Date: Wed, 4 Mar 2009 22:09:54 +0000
Subject: [PATCH] nohalo resampler simplification/speed up

---
 libsrc/resample/nohalo.cpp | 575 +++++++++++++++++--------------------
 1 file changed, 269 insertions(+), 306 deletions(-)

diff --git a/libsrc/resample/nohalo.cpp b/libsrc/resample/nohalo.cpp
index bfe6bc36..29d41053 100644
--- a/libsrc/resample/nohalo.cpp
+++ b/libsrc/resample/nohalo.cpp
@@ -5,19 +5,20 @@
 
     This file is part of VIPS.
     
-    VIPS is free software; you can redistribute it and/or modify
-    it under the terms of the GNU Lesser General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
+    VIPS is free software; you can redistribute it and/or modify it
+    under the terms of the GNU Lesser General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.
 
     This program is distributed in the hope that it will be useful,
     but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU Lesser General Public License for more details.
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+    Lesser General Public License for more details.
 
-    You should have received a copy of the GNU Lesser General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+    You should have received a copy of the GNU Lesser General Public
+    License along with this program; if not, write to the Free
+    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+    02111-1307 USA
 
  */
 
@@ -30,15 +31,16 @@
 /*
  * 2009 (c) Nicolas Robidoux 
  *
- * Thanks: Geert Jordaens, John Cupitt, Minglun Gong, Øyvind Kolås and
- * Sven Neumann for useful comments and code.
+ * Nicolas thanks Geert Jordaens, John Cupitt, Minglun Gong, Øyvind
+ * Kolås and Sven Neumann for useful comments and code.
  * 
- * Acknowledgement: Nicolas Robidoux's research on nohalo funded in
- * part by an NSERC (National Science and Engineering Research Council
- * of Canada) Discovery Grant.
+ * Nicolas Robidoux's research on nohalo funded in part by an NSERC
+ * (National Science and Engineering Research Council of Canada)
+ * Discovery Grant.
  */
 
 /* Hacked for vips by J. Cupitt, 20/1/09
+ * Tweaks by N. Robidoux, 03/3/09
  */
 
 /*
@@ -63,9 +65,9 @@
  * not change the answer, and consequently do not increase its
  * quality.
  *
- * ============================================================
- * WARNING: THIS CODE ONLY IMPLEMENTS THE LOWEST QUALITY NOHALO
- * ============================================================
+ * ===================================================
+ * THIS CODE ONLY IMPLEMENTS THE LOWEST QUALITY NOHALO
+ * ===================================================
  * 
  * This code implement nohalo for (quality) level = 1.  Nohalo for
  * higher quality levels will be implemented later.
@@ -114,11 +116,7 @@
  *
  * The value of the reconstructed intensity surface at any point
  * depends on the values of (at most) 12 nearby input values, located
- * in a "cross" centered at the closest four input pixel centers. For
- * computational expediency, the input values corresponding to the
- * nearest 21 input pixel locations (5x5 minus the four corners)
- * should be made available through a data pointer. The code then
- * selects the needed ones from this enlarged stencil.
+ * in a "cross" centered at the closest four input pixel centers.
  *
  * ===========================================================
  * When level = infinity, nohalo's intensity surface is smooth
@@ -140,25 +138,26 @@
  *
  * (Except possibly near the boundary: it is easy to make this
  * property carry over everywhere but this requires a tuned abyss
- * policy or building the boundary conditions inside the sampler.)
- * Nohalo is exact on linear intensity profiles, meaning that if the
- * input pixel values (in the stencil) are obtained from a function of
- * the form f(x,y) = a + b*x + c*y (a, b, c constants), then the
- * computed pixel value is exactly the value of f(x,y) at the
- * asked-for sampling location. The boundary condition which is
- * emulated by VIPS throught the "extend" extension of the input
- * image---this corresponds to the nearest neighbour abyss
- * policy---does NOT make this resampler exact on linears at the
- * boundary. It does, however, guarantee that no clamping is required
- * even when resampled values are computed at positions outside of the
- * extent of the input image (when extrapolation is required).
+ * policy---linear extrapolation, say---or building the boundary
+ * conditions inside the sampler.)  Nohalo is exact on linear
+ * intensity profiles, meaning that if the input pixel values (in the
+ * stencil) are obtained from a function of the form f(x,y) = a + b*x
+ * + c*y (a, b, c constants), then the computed pixel value is exactly
+ * the value of f(x,y) at the asked-for sampling location. The
+ * boundary condition which is emulated by VIPS throught the "extend"
+ * extension of the input image---this corresponds to the nearest
+ * neighbour abyss policy---does NOT make this resampler exact on
+ * linears at the boundary. It does, however, guarantee that no
+ * clamping is required even when resampled values are computed at
+ * positions outside of the extent of the input image (when
+ * extrapolation is required).
  *
  * ===================
  * Nohalo is nonlinear
  * ===================
  *
  * In particular, resampling a sum of images may not be the same as
- * summing the resamples (this occurs even without taking into account
+ * summing the resamples. (This occurs even without taking into account
  * over and underflow issues: images can only take values within a
  * banded range, and consequently no sampler is truly linear.)
  *
@@ -188,10 +187,6 @@
  * intensity surface can be treated as if smooth.)
  */
 
-/*
-#define DEBUG
- */
-
 #ifdef HAVE_CONFIG_H
 #include <config.h>
 #endif /*HAVE_CONFIG_H*/
@@ -274,260 +269,236 @@ typedef struct _VipsInterpolateNohaloClass {
 
 } VipsInterpolateNohaloClass;
 
-/* Calculate the four results surrounding the target point, our caller does
- * bilinear interpolation of them.
- */
-
 static void inline
 nohalo_sharp_level_1( 
+	const double uno_two,
+	const double uno_thr,
+	const double dos_one,
+	const double dos_two,
 	const double dos_thr,
 	const double dos_fou,
+	const double tre_one,
 	const double tre_two,
 	const double tre_thr,
 	const double tre_fou,
-	const double tre_fiv,
 	const double qua_two,
 	const double qua_thr,
-	const double qua_fou,
-	const double qua_fiv,
-	const double cin_thr,
-	const double cin_fou,
 	double *r1,
 	double *r2,
 	double *r3 )
 {
-	/* Start of copy-paste from Nicolas's source.
-	 */
-
-  /*
-   * THE ENLARGED STENCIL (prior to entering this function):
-   *
-   * The potentially needed input pixel values are described by the
-   * following stencil, where (ix,iy) are the coordinates of the
-   * closest input pixel center (with ties resolved arbitrarily).
-   *
-   * Spanish abbreviations are used to label positions from top to
-   * bottom (rows), English ones to label positions from left to right
-   * (columns).
-   *
-   *               (ix-1,iy-2)  (ix,iy-2)    (ix+1,iy-2)
-   *               = uno_two    = uno_thr    = uno_fou
-   *
-   *  (ix-2,iy-1)  (ix-1,iy-1)  (ix,iy-1)    (ix+1,iy-1)   (ix+2,iy-1)
-   *  = dos_one    = dos_two    = dos_thr    = dos_fou     = dos_fiv
-   *
-   *  (ix-2,iy)    (ix-1,iy)    (ix,iy)      (ix+1,iy)     (ix+2,iy)
-   *  = tre_one    = tre_two    = tre_thr    = tre_fou     = tre_fiv
-   *
-   *  (ix-2,iy+1)  (ix-1,iy+1)  (ix,iy+1)    (ix+1,iy+1)   (ix+2,iy+1)
-   *  = qua_one    = qua_two    = qua_thr    = qua_fou     = qua_fiv
-   *
-   *               (ix-1,iy+2)  (ix,iy+2)    (ix+1,iy+2)
-   *               = cin_two    = cin_thr    = cin_fou
-   *
-   * THE STENCIL OF ACTUALLY READ VALUES:
-   *
-   * The above is the "enlarged" stencil: about half the values will
-   * not be used.  Once symmetry has been used to assume that the
-   * sampling point is to the right and bottom of tre_thr---this is
-   * done by implicitly reflecting the data if needed---the actually
-   * used input values are named thus:
-   *
-   *                              dos_thr      dos_fou
-   *
-   *                 tre_two      tre_thr      tre_fou       tre_fiv
-   *
-   *                 qua_two      qua_thr      qua_fou       qua_fiv
-   *
-   *                              cin_thr      cin_fou
-   *
-   * (If, for exammple, relative_x_is_left is 1 but relative_y_is___up
-   * = 0, then dos_fou in this post-reflexion reduced stencil really
-   * corresponds to dos_two in the "enlarged" one, etc.)
-   *
-   * Given that the reflexions are performed "outside of the
-   * nohalo_sharp_level_1 function," the above 12 input values are the
-   * only ones which are read from the buffer.
+  /* 
+   * This function calculates the missing three double density pixel
+   * values. The caller does bilinear interpolation on them and
+   * dos_two.
+   */
+  /*
+   * THE STENCIL OF INPUT VALUES:
+   *
+   * Nohalo's stencil is the same as, say, Catmull-Rom, with the
+   * exception that the four corner values are not used:
+   *
+   *               (ix-1,iy-2)  (ix,iy-2)
+   *               = uno_two    = uno_thr
+   *
+   *  (ix-2,iy-1)  (ix-1,iy-1)  (ix,iy-1)    (ix+1,iy-1)
+   *  = dos_one    = dos_two    = dos_thr    = dos_fou
+   *
+   *  (ix-2,iy)    (ix-1,iy)    (ix,iy)      (ix+1,iy)
+   *  = tre_one    = tre_two    = tre_thr    = tre_fou
+   *
+   *               (ix-1,iy+1)  (ix,iy+1)
+   *               = qua_two    = qua_thr
+   *
+   * The indices associated with the values shown above are in the
+   * case that the resampling point is closer to (ix-1,iy-1) than the
+   * other three central positions. Pointer arithmetic is used to
+   * implicitly reflect the input stencil in the other three cases,
+   * For example, if the sampling position is closer to dos_two (that
+   * is, if relative_x_is_rite = 1 but relative_y_is_down = 0 below),
+   * then dos_two corresponds to (ix,iy-1), dos_thr corresponds to
+   * (ix-1,iy-1) etc. Consequently, the three missing double density
+   * values are halfway between dos_two and dos_thr, halfway between
+   * dos_two and tre_two, and at the average of the four central
+   * positions.
    */
-
   /*
    * Computation of the nonlinear slopes: If two consecutive pixel
    * value differences have the same sign, the smallest one (in
    * absolute value) is taken to be the corresponding slope; if the
    * two consecutive pixel value differences don't have the same sign,
-   * the corresponding slope is set to 0.
+   * the corresponding slope is set to 0. (In other words, apply
+   * minmod to comsecutive slopes.)
    */
+  /*
+   * Dos(s) horizontal differences:
+   */
+  const double prem_dos = dos_two - dos_one;
+  const double deux_dos = dos_thr - dos_two;
+  const double troi_dos = dos_fou - dos_thr;
   /*
    * Tre(s) horizontal differences:
    */
+  const double prem_tre = tre_two - tre_one;
   const double deux_tre = tre_thr - tre_two;
   const double troi_tre = tre_fou - tre_thr;
-  const double quat_tre = tre_fiv - tre_fou;
   /*
-   * Qua(ttro) horizontal differences:
+   * Two vertical differences:
    */
-  const double deux_qua = qua_thr - qua_two;
-  const double troi_qua = qua_fou - qua_thr;
-  const double quat_qua = qua_fiv - qua_fou;
+  const double prem_two = dos_two - uno_two;
+  const double deux_two = tre_two - dos_two;
+  const double troi_two = qua_two - tre_two;
   /*
    * Thr(ee) vertical differences:
    */
+  const double prem_thr = dos_thr - uno_thr;
   const double deux_thr = tre_thr - dos_thr;
   const double troi_thr = qua_thr - tre_thr;
-  const double quat_thr = cin_thr - qua_thr;
-  /*
-   * Fou(r) vertical differences:
-   */
-  const double deux_fou = tre_fou - dos_fou;
-  const double troi_fou = qua_fou - tre_fou;
-  const double quat_fou = cin_fou - qua_fou;
-
-  /*
-   * Tre:
-   */
-  const double half_sign_deux_tre = deux_tre >= 0. ? .5 : -.5;
-  const double half_sign_troi_tre = troi_tre >= 0. ? .5 : -.5;
-  const double half_sign_quat_tre = quat_tre >= 0. ? .5 : -.5;
-  /*
-   * Qua:
-   */
-  const double half_sign_deux_qua = deux_qua >= 0. ? .5 : -.5;
-  const double half_sign_troi_qua = troi_qua >= 0. ? .5 : -.5;
-  const double half_sign_quat_qua = quat_qua >= 0. ? .5 : -.5;
-  /*
-   * Thr:
-   */
-  const double half_sign_deux_thr = deux_thr >= 0. ? .5 : -.5;
-  const double half_sign_troi_thr = troi_thr >= 0. ? .5 : -.5;
-  const double half_sign_quat_thr = quat_thr >= 0. ? .5 : -.5;
-  /*
-   * Fou:
-   */
-  const double half_sign_deux_fou = deux_fou >= 0. ? .5 : -.5;
-  const double half_sign_troi_fou = troi_fou >= 0. ? .5 : -.5;
-  const double half_sign_quat_fou = quat_fou >= 0. ? .5 : -.5;
 
   /*
    * Useful later:
    */
-  const double tre_thr_plus_tre_fou  = tre_thr + tre_fou;
-  const double tre_thr_plus_qua_thr  = tre_thr + qua_thr;
-  const double qua_fou_minus_tre_thr = qua_fou - tre_thr;
+  const double dos_two_plus_dos_thr  = dos_two + dos_thr;
+  const double dos_two_plus_tre_two  = dos_two + tre_two;
 
+  /*
+   * Dos:
+   */
+  const double half_sign_prem_dos = prem_dos >= 0. ? .5 : -.5;
+  const double half_sign_deux_dos = deux_dos >= 0. ? .5 : -.5;
+  const double half_sign_troi_dos = troi_dos >= 0. ? .5 : -.5;
   /*
    * Tre:
    */
+  const double half_sign_prem_tre = prem_tre >= 0. ? .5 : -.5;
+  const double half_sign_deux_tre = deux_tre >= 0. ? .5 : -.5;
+  const double half_sign_troi_tre = troi_tre >= 0. ? .5 : -.5;
+  /*
+   * Two:
+   */
+  const double half_sign_prem_two = prem_two >= 0. ? .5 : -.5;
+  const double half_sign_deux_two = deux_two >= 0. ? .5 : -.5;
+  const double half_sign_troi_two = troi_two >= 0. ? .5 : -.5;
+  /*
+   * Thr:
+   */
+  const double half_sign_prem_thr = prem_thr >= 0. ? .5 : -.5;
+  const double half_sign_deux_thr = deux_thr >= 0. ? .5 : -.5;
+  const double half_sign_troi_thr = troi_thr >= 0. ? .5 : -.5;
+
+  /*
+   * Dos:
+   */
+  const double half_abs_prem_dos = half_sign_prem_dos * prem_dos;
+  const double sign_dos_two_horizo = half_sign_prem_dos + half_sign_deux_dos;
+  const double half_abs_deux_dos = half_sign_deux_dos * deux_dos;
+  const double sign_dos_thr_horizo = half_sign_deux_dos + half_sign_troi_dos;
+  const double half_abs_troi_dos = half_sign_troi_dos * troi_dos;
+  /*
+   * Two:
+   */
+  const double half_abs_prem_two = half_sign_prem_two * prem_two;
+  const double sign_dos_two_vertic = half_sign_prem_two + half_sign_deux_two;
+  const double half_abs_deux_two = half_sign_deux_two * deux_two;
+  const double sign_tre_two_vertic = half_sign_deux_two + half_sign_troi_two;
+  const double half_abs_troi_two = half_sign_troi_two * troi_two;
+  /*
+   * Tre:
+   */
+  const double half_abs_prem_tre = half_sign_prem_tre * prem_tre;
+  const double sign_tre_two_horizo = half_sign_prem_tre + half_sign_deux_tre;
   const double half_abs_deux_tre = half_sign_deux_tre * deux_tre;
   const double sign_tre_thr_horizo = half_sign_deux_tre + half_sign_troi_tre;
   const double half_abs_troi_tre = half_sign_troi_tre * troi_tre;
-  const double sign_tre_fou_horizo = half_sign_troi_tre + half_sign_quat_tre;
-  const double half_abs_quat_tre = half_sign_quat_tre * quat_tre;
   /*
    * Thr:
    */
+  const double half_abs_prem_thr = half_sign_prem_thr * prem_thr;
+  const double sign_dos_thr_vertic = half_sign_prem_thr + half_sign_deux_thr;
   const double half_abs_deux_thr = half_sign_deux_thr * deux_thr;
   const double sign_tre_thr_vertic = half_sign_deux_thr + half_sign_troi_thr;
   const double half_abs_troi_thr = half_sign_troi_thr * troi_thr;
-  const double sign_qua_thr_vertic = half_sign_troi_thr + half_sign_quat_thr;
-  const double half_abs_quat_thr = half_sign_quat_thr * quat_thr;
-  /*
-   * Qua:
-   */
-  const double half_abs_deux_qua = half_sign_deux_qua * deux_qua;
-  const double sign_qua_thr_horizo = half_sign_deux_qua + half_sign_troi_qua;
-  const double half_abs_troi_qua = half_sign_troi_qua * troi_qua;
-  const double sign_qua_fou_horizo = half_sign_troi_qua + half_sign_quat_qua;
-  const double half_abs_quat_qua = half_sign_quat_qua * quat_qua;
-  /*
-   * Fou:
-   */
-  const double half_abs_deux_fou = half_sign_deux_fou * deux_fou;
-  const double sign_tre_fou_vertic = half_sign_deux_fou + half_sign_troi_fou;
-  const double half_abs_troi_fou = half_sign_troi_fou * troi_fou;
-  const double sign_qua_fou_vertic = half_sign_troi_fou + half_sign_quat_fou;
-  const double half_abs_quat_fou = half_sign_quat_fou * quat_fou;
 
+  /*
+   * Useful later:
+   */
+  const double tre_thr_minus_dos_two = deux_thr + deux_dos;
+
+  /*
+   * Dos:
+   */
+  const double half_size_dos_two_horizo =
+    FAST_MIN( half_abs_prem_dos, half_abs_deux_dos );
+  const double half_size_dos_thr_horizo =
+    FAST_MIN( half_abs_troi_dos, half_abs_deux_dos );
+  /*
+   * Two:
+   */
+  const double half_size_dos_two_vertic =
+    FAST_MIN( half_abs_prem_two, half_abs_deux_two );
+  const double half_size_tre_two_vertic =
+    FAST_MIN( half_abs_troi_two, half_abs_deux_two );
   /*
    * Tre:
    */
+  const double half_size_tre_two_horizo =
+    FAST_MIN( half_abs_prem_tre, half_abs_deux_tre );
   const double half_size_tre_thr_horizo =
-    FAST_MIN( half_abs_deux_tre, half_abs_troi_tre );
-  const double half_size_tre_fou_horizo =
-    FAST_MIN( half_abs_quat_tre, half_abs_troi_tre );
+    FAST_MIN( half_abs_troi_tre, half_abs_deux_tre );
   /*
    * Thr:
    */
+  const double half_size_dos_thr_vertic =
+    FAST_MIN( half_abs_prem_thr, half_abs_deux_thr );
   const double half_size_tre_thr_vertic =
-    FAST_MIN( half_abs_deux_thr, half_abs_troi_thr );
-  const double half_size_qua_thr_vertic =
-    FAST_MIN( half_abs_quat_thr, half_abs_troi_thr );
-  /*
-   * Qua:
-   */
-  const double half_size_qua_thr_horizo =
-    FAST_MIN( half_abs_deux_qua, half_abs_troi_qua );
-  const double half_size_qua_fou_horizo =
-    FAST_MIN( half_abs_quat_qua, half_abs_troi_qua );
-  /*
-   * Fou:
-   */
-  const double half_size_tre_fou_vertic =
-    FAST_MIN( half_abs_deux_fou, half_abs_troi_fou );
-  const double half_size_qua_fou_vertic =
-    FAST_MIN( half_abs_quat_fou, half_abs_troi_fou );
+    FAST_MIN( half_abs_troi_thr, half_abs_deux_thr );
 
   /*
-   * Compute the needed "right" (at the boundary between two input
+   * Compute the needed "right" (at the boundary between one input
    * pixel areas) double resolution pixel value:
    */
-  /*
-   * Tre:
-   */
-  const double two_times_tre_thrfou =
-    tre_thr_plus_tre_fou
+  const double two_times_dos_twothr =
+    dos_two_plus_dos_thr
     +
-    sign_tre_thr_horizo * half_size_tre_thr_horizo
+    sign_dos_two_horizo * half_size_dos_two_horizo
     -
-    sign_tre_fou_horizo * half_size_tre_fou_horizo;
+    sign_dos_thr_horizo * half_size_dos_thr_horizo;
 
   /*
    * Compute the needed "down" double resolution pixel value:
    */
-  /*
-   * Thr:
-   */
-  const double two_times_trequa_thr =
-    tre_thr_plus_qua_thr
+  const double two_times_dostre_two =
+    dos_two_plus_tre_two
     +
-    sign_tre_thr_vertic * half_size_tre_thr_vertic
+    sign_dos_two_vertic * half_size_dos_two_vertic
     -
-    sign_qua_thr_vertic * half_size_qua_thr_vertic;
+    sign_tre_two_vertic * half_size_tre_two_vertic;
 
   /*
-   * Compute the "diagonal" (at the boundary between four input
+   * Compute the "diagonal" (at the boundary between thrr input
    * pixel areas) double resolution pixel value:
    */
-  const double four_times_trequa_thrfou =
-    qua_fou_minus_tre_thr
+  const double four_times_dostre_twothr =
+    tre_thr_minus_dos_two
     +
-    sign_qua_thr_horizo * half_size_qua_thr_horizo
+    sign_tre_two_horizo * half_size_tre_two_horizo
     -
-    sign_qua_fou_horizo * half_size_qua_fou_horizo
+    sign_tre_thr_horizo * half_size_tre_thr_horizo
     +
-    sign_tre_fou_vertic * half_size_tre_fou_vertic
+    sign_dos_thr_vertic * half_size_dos_thr_vertic
     -
-    sign_qua_fou_vertic * half_size_qua_fou_vertic
+    sign_tre_thr_vertic * half_size_tre_thr_vertic
     +
-    two_times_tre_thrfou
+    two_times_dos_twothr
     +
-    two_times_trequa_thr;
+    two_times_dostre_two;
 
-	/* End of copy-paste from Nicolas' source.
-	 */
-
-	*r1 = two_times_tre_thrfou;
-	*r2 = two_times_trequa_thr;
-	*r3 = four_times_trequa_thrfou;
+  /*
+   * Return the newly computed double density values:
+   */
+  *r1 = two_times_dos_twothr;
+  *r2 = two_times_dostre_two;
+  *r3 = four_times_dostre_twothr;
 }
 
 /* Call nohalo_sharp_level_1 with an interpolator as a parameter.
@@ -536,94 +507,85 @@ nohalo_sharp_level_1(
  */
 #define NOHALO_SHARP_LEVEL_1_INTER( inter ) \
   template <typename T> static void inline \
-  nohalo_sharp_level_1_ ## inter( PEL *pout, \
-                                  const PEL *pin, \
-                                  const int bands, \
-                                  const int lskip, \
-                                  const double relative_x, \
-                                  const double relative_y ) \
+  nohalo_sharp_level_1_ ## inter(       PEL    *pout, \
+                                  const PEL    *pin, \
+                                  const int     bands, \
+                                  const int     lskip, \
+                                  const double  relative_x, \
+                                  const double  relative_y ) \
   { \
     T* restrict out = (T *) pout; \
-    const T* restrict in = (T *) pin; \
     \
-    const int relative_x_is_left = ( relative_x < 0. ); \
-    const int relative_y_is___up = ( relative_y < 0. ); \
+    const int relative_x_is_rite = ( relative_x >= 0. ); \
+    const int relative_y_is_down = ( relative_y >= 0. ); \
+    \
+    const int sign_of_relative_x = 2 * relative_x_is_rite - 1; \
+    const int sign_of_relative_y = 2 * relative_y_is_down - 1; \
     \
     const int corner_reflection_shift = \
-        ( -2 + 4 * relative_x_is_left ) * bands \
-        + \
-        ( -2 + 4 * relative_y_is___up ) * lskip; \
+      relative_x_is_rite * bands + relative_y_is_down * lskip; \
     \
-    const int sign_of_relative_x = 1 - 2 * relative_x_is_left; \
-    const int sign_of_relative_y = 1 - 2 * relative_y_is___up; \
-    \
-    const double x = ( 2 * sign_of_relative_x ) * relative_x; \
-    const double y = ( 2 * sign_of_relative_y ) * relative_y; \
-    \
-    const double x_times_y = x * y; \
-    const double w_times_y = y - x_times_y; \
-    const double x_times_z = x - x_times_y; \
-    const double w_times_z = 1. - x - w_times_y; \
-    \
-    const double x_times_y_over_4 = .25 * x_times_y; \
-    const double w_times_y_over_2 = .5  * w_times_y; \
-    const double x_times_z_over_2 = .5  * x_times_z; \
+    const T* restrict in = ( (T *) pin ) + corner_reflection_shift; \
     \
     const int shift_1_pixel  = sign_of_relative_x * bands; \
     const int shift_1_row    = sign_of_relative_y * lskip; \
     \
-    const int b1 =     shift_1_pixel + corner_reflection_shift; \
-    const int b2 = 2 * shift_1_pixel + corner_reflection_shift; \
-    const int b3 = 3 * shift_1_pixel + corner_reflection_shift; \
-    const int b4 = 4 * shift_1_pixel + corner_reflection_shift; \
+    const double w = ( 2 * sign_of_relative_x ) * relative_x; \
+    const double z = ( 2 * sign_of_relative_y ) * relative_y; \
     \
-    const int l1 =     shift_1_row; \
-    const int l2 = 2 * shift_1_row; \
-    const int l3 = 3 * shift_1_row; \
-    const int l4 = 4 * shift_1_row; \
+    const int uno_two_shift = shift_1_row; \
+    const int uno_thr_shift = shift_1_row - shift_1_pixel; \
     \
-    for( int z = 0; z < bands; z++ ) { \
-      const T dos_thr = in[b2 + l1]; \
-      const T dos_fou = in[b3 + l1]; \
+    const int dos_one_shift = shift_1_pixel; \
+    const int dos_two_shift = 0; \
+    const int dos_thr_shift = -shift_1_pixel; \
+    const int dos_fou_shift = -2 * shift_1_pixel; \
+    \
+    const int tre_one_shift = dos_one_shift - shift_1_row; \
+    const int tre_two_shift = -shift_1_row; \
+    const int tre_thr_shift = dos_thr_shift - shift_1_row; \
+    const int tre_fou_shift = dos_fou_shift - shift_1_row; \
+    \
+    const int qua_two_shift = tre_two_shift - shift_1_row; \
+    const int qua_thr_shift = tre_thr_shift - shift_1_row; \
+    \
+    const double x = 1. - w; \
+    const double w_times_z = w * z; \
+    const double x_times_z = x * z; \
+    const double w_times_y_over_2 = .5  * ( w - w_times_z ); \
+    const double x_times_z_over_2 = .5  * x_times_z; \
+    const double x_times_y_over_4 = .25 * ( x - x_times_z ); \
+    \
+    for( int band = 0; band < bands; band++ ) { \
+      double two_times_dos_twothr; \
+      double two_times_dostre_two; \
+      double four_times_dostre_twothr; \
       \
-      const T tre_two = in[b1 + l2]; \
-      const T tre_thr = in[b2 + l2]; \
-      const T tre_fou = in[b3 + l2]; \
-      const T tre_fiv = in[b4 + l2]; \
-                                     \
-      const T qua_two = in[b1 + l3]; \
-      const T qua_thr = in[b2 + l3]; \
-      const T qua_fou = in[b3 + l3]; \
-      const T qua_fiv = in[b4 + l3]; \
-                                     \
-      const T cin_thr = in[b2 + l4]; \
-      const T cin_fou = in[b3 + l4]; \
-                                     \
-      double two_times_tre_thrfou;   \
-      double two_times_trequa_thr;   \
-      double four_times_trequa_thrfou; \
+      const double dos_two = in[dos_two_shift]; \
       \
-      nohalo_sharp_level_1( dos_thr, dos_fou, \
-                            tre_two, tre_thr, tre_fou, tre_fiv, \
-                            qua_two, qua_thr, qua_fou, qua_fiv, \
-                            cin_thr, cin_fou, \
-                            &two_times_tre_thrfou, \
-                            &two_times_trequa_thr, \
-                            &four_times_trequa_thrfou ); \
+      nohalo_sharp_level_1( in[uno_two_shift], in[uno_thr_shift], \
+                            in[dos_one_shift], dos_two, \
+                            in[dos_thr_shift], in[dos_fou_shift], \
+                            in[tre_one_shift], in[tre_two_shift], \
+                            in[tre_thr_shift], in[tre_fou_shift], \
+                            in[qua_two_shift], in[qua_thr_shift], \
+                            &two_times_dos_twothr, \
+                            &two_times_dostre_two, \
+                            &four_times_dostre_twothr ); \
+      \
+      in += 1; \
       \
       const T result = bilinear_ ## inter<T>( \
                                              w_times_z, \
                                              x_times_z_over_2, \
                                              w_times_y_over_2, \
                                              x_times_y_over_4, \
-                                             tre_thr, \
-                                             two_times_tre_thrfou, \
-                                             two_times_trequa_thr, \
-                                             four_times_trequa_thrfou ); \
+                                             dos_two, \
+                                             two_times_dos_twothr, \
+                                             two_times_dostre_two, \
+                                             four_times_dostre_twothr ); \
       \
-      out[z] = result; \
-      \
-      in += 1; \
+      out[band] = result; \
     } \
   } 
 
@@ -640,44 +602,45 @@ G_DEFINE_TYPE( VipsInterpolateNohalo, vips_interpolate_nohalo,
 
 static void
 vips_interpolate_nohalo_interpolate( VipsInterpolate *interpolate, 
-                                     PEL *out,
-                                     REGION *in,
-                                     double absolute_x,
-                                     double absolute_y )
+                                     PEL             *out,
+                                     REGION          *in,
+                                     double           absolute_x,
+                                     double           absolute_y )
 {
   /*
    * floor's surrogate FAST_PSEUDO_FLOOR is used to make sure that the
    * transition through 0 is smooth. If it is known that absolute_x
-   * and absolute_y will never be less than -.5, plain cast---that is,
-   * const int ix = absolute_x + .5---should be used instead.  Any
-   * function which agrees with floor for non-integer values, and
+   * and absolute_y will never be less than 0, plain cast---that is,
+   * const int ix = absolute_x---should be used instead.  Actually,
+   * any function which agrees with floor for non-integer values, and
    * picks one of the two possibilities for integer values, can be
-   * used.
+   * used. FAST_PSEUDO_FLOOR fits the bill.
    */
-  const int ix = FAST_PSEUDO_FLOOR (absolute_x + 0.5);
-  const int iy = FAST_PSEUDO_FLOOR (absolute_y + 0.5);
+  const int ix = FAST_PSEUDO_FLOOR (absolute_x);
+  const int iy = FAST_PSEUDO_FLOOR (absolute_y);
 
-	/* Move the pointer to (the first band of) the central
-           pixel of the extended 5x5 stencil (tre_thr):
-	 */
-	const PEL * restrict p =
-		(PEL *) IM_REGION_ADDR( in, ix, iy );
+  /*
+   * Move the pointer to (the first band of) the top/left pixel
+   * of the 2x2 group of pixel centers which contains the
+   * sampling location in its convex hull:
+   */
+  const PEL * restrict p = (PEL *) IM_REGION_ADDR( in, ix, iy );
 
-	/* VIPS versions of Nicolas's pixel addressing values.
-	 */
-	const int bands = in->im->Bands;
-	const int lskip = 
-		IM_REGION_LSKIP( in ) / IM_IMAGE_SIZEOF_ELEMENT( in->im );
+  /*
+   * VIPS versions of Nicolas's pixel addressing values:
+   */
+  const int bands = in->im->Bands;
+  const int lskip = IM_REGION_LSKIP( in ) / IM_IMAGE_SIZEOF_ELEMENT( in->im );
 
   /*
    * x is the x-coordinate of the sampling point relative to the
-   * position of the tre_thr pixel center. Similarly for y. Range of
-   * values: (-.5,.5].
+   * position of the center of the convex hull of the 2x2 block of
+   * closest pixels. Similarly for y. Range of values: [-.5,.5).
    */
-  const double relative_x = absolute_x - ix;
-  const double relative_y = absolute_y - iy;
+  const double relative_x = absolute_x - ix - .5;
+  const double relative_y = absolute_y - iy - .5;
 
-#define CALL( T, inter )      \
+#define CALL( T, inter ) \
   nohalo_sharp_level_1_ ## inter<T>( out, \
                                      p, \
                                      bands, \
@@ -750,11 +713,11 @@ vips_interpolate_nohalo_class_init( VipsInterpolateNohaloClass *klass )
 		VIPS_INTERPOLATE_CLASS( klass );
 
 	object_class->nickname = "nohalo";
-	object_class->description = _( "Bilinear plus edge enhance" );
+	object_class->description = _( "Edge-enhancing bilinear" );
 
 	interpolate_class->interpolate = 
 		vips_interpolate_nohalo_interpolate;
-	interpolate_class->window_size = 5;
+	interpolate_class->window_size = 4;
 }
 
 static void