diff --git a/ChangeLog b/ChangeLog
index de702aa0..84a4bace 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -5,6 +5,8 @@
 - add -lstdc++ to vips-7.xx.pc, if we used it
 - im_vips2png() / im_png2vips() set / get png resolution (thanks Zhiyu Wu)
 - updated README
+- don't use tables for bilinear on float data for a small speedup (thanks
+  Nicolas)
 
 30/11/10 started 7.24.0
 - bump for new stable
diff --git a/TODO b/TODO
index 2bbbd25c..66b356b6 100644
--- a/TODO
+++ b/TODO
@@ -1,3 +1,19 @@
+- bilinear benchmarks
+
+	8-bit image, tables, fixed-point, 3.7s (7.24 one)
+	8-bit image, no tables, float, 5.2s
+	8-bit image, no tables, fixed-point, 4.2s
+
+	float image, tables, 5.2s (7.24 one)
+	float image, no tables, sp float coeff, 4.2s
+	float image, no tables, dp float coeff, 4.8s
+
+
+
+
+
+
+
 
 
 - im_conv()/im_morph() could have more than 10 programs? try 20 and see if we
@@ -8,6 +24,8 @@
 
 - fits save
 
+- lazy fits load
+
 
 
 
diff --git a/libvips/format/fits.c b/libvips/format/fits.c
index ef1886cb..f281879b 100644
--- a/libvips/format/fits.c
+++ b/libvips/format/fits.c
@@ -60,6 +60,12 @@
 #include <dmalloc.h>
 #endif /*WITH_DMALLOC*/
 
+/* vips only supports 3 dimensions, but we allow up to MAX_DIMENSIONS as long
+ * as the higher dimensions are all empty. If you change this value, change
+ * fits2vips_get_header() as well.
+ */
+#define MAX_DIMENSIONS (10)
+
 /* What we track during a cfitsio-file read.
  */
 typedef struct {
@@ -69,7 +75,7 @@ typedef struct {
 	fitsfile *fptr;
 	int datatype;
 	int naxis;
-	long long int naxes[10];
+	long long int naxes[MAX_DIMENSIONS];
 } Read;
 
 static void
@@ -154,15 +160,18 @@ fits2vips_get_header( Read *read )
 		return( -1 );
 	}
 
+#ifdef DEBUG
 	printf( "naxis = %d\n", read->naxis );
 	for( i = 0; i < read->naxis; i++ )
 		printf( "%d) %lld\n", i, read->naxes[i] );
+#endif /*DEBUG*/
 
 	width = 1;
 	height = 1;
 	bands = 1;
 	switch( read->naxis ) {
-	/* If you add more dimensions here, adjust data read below.
+	/* If you add more dimensions here, adjust data read below. See also
+	 * the definition of MAX_DIMENSIONS above.
 	 */
 	case 10:
 	case 9:
@@ -284,8 +293,10 @@ fits2vips_header( const char *filename, IMAGE *out )
 	return( 0 );
 }
 
+/* Read the whole image in scanlines.
+ */
 static int
-fits2vips_get_data( Read *read )
+fits2vips_get_data_scanlinewise( Read *read )
 {
 	IMAGE *im = read->out;
 	const int es = IM_IMAGE_SIZEOF_ELEMENT( im );
@@ -296,6 +307,8 @@ fits2vips_get_data( Read *read )
 	int x, y, b, z;
 	int status;
 
+	long fpixel[MAX_DIMENSIONS];
+
 	status = 0;
 
 	if( !(line_buffer = IM_ARRAY( im, IM_IMAGE_SIZEOF_LINE( im ), PEL )) ||
@@ -305,14 +318,9 @@ fits2vips_get_data( Read *read )
 		return( -1 );
 
 	for( y = 0; y < im->Ysize; y++ ) {
-		/* Keep max no of dimensions in line with the header check
-		 * above.
-		 */
-		long int fpixel[10];
-
 		/* Start of scanline. We have to read top-to-bottom.
 		 */
-		for( b = 0; b < 10; b++ )
+		for( b = 0; b < MAX_DIMENSIONS; b++ )
 			fpixel[b] = 1;
 		fpixel[1] = im->Ysize - y;
 
@@ -347,6 +355,94 @@ fits2vips_get_data( Read *read )
 	return( 0 );
 }
 
+static int
+fits2vips_generate( REGION *out, void *seq, void *a, void *b )
+{
+	Read *read = (Read *) a;
+	Rect *r = &out->valid;
+
+	IMAGE *im = read->out;
+	const int es = IM_IMAGE_SIZEOF_ELEMENT( im );
+
+	PEL *line_buffer;
+	PEL *band_buffer;
+	PEL *p, *q;
+	int x, y, z, k;
+	int status;
+
+	status = 0;
+
+	long fpixel[MAX_DIMENSIONS];
+	long lpixel[MAX_DIMENSIONS];
+	long inc[MAX_DIMENSIONS];
+
+	if( !(line_buffer = IM_ARRAY( im, IM_IMAGE_SIZEOF_LINE( im ), PEL )) ||
+		!(band_buffer = IM_ARRAY( im, es * im->Xsize, PEL )) ||
+		im_outcheck( im ) ||
+		im_setupout( im ) )
+		return( -1 );
+
+	/* Read out the entire
+	for( b = 0; b < MAX_DIMENSIONS; b++ )
+		fpixel[b] = 1;
+	fpixel[1] = im->Ysize - y;
+
+	if( fits_read_subset( read->fptr, read->datatype, 
+		long *fpixel,
+			             long *lpixel, long *inc, void *nulval,  void *array,
+				                  int *anynul, int *status)
+	 */
+
+
+	for( y = 0; y < im->Ysize; y++ ) {
+		long int fpixel[MAX_DIMENSIONS];
+
+		/* Start of scanline. We have to read top-to-bottom.
+		 */
+		for( z = 0; z < MAX_DIMENSIONS; z++ )
+			fpixel[z] = 1;
+		fpixel[1] = im->Ysize - y;
+
+		for( z = 0; z < im->Bands; z++ ) {
+			fpixel[2] = z + 1;
+
+			/* Read one band of one scanline, then scatter-write
+			 * into the line buffer.
+			 */
+			if( fits_read_pix( read->fptr, 
+				read->datatype, fpixel, im->Xsize,
+				NULL, band_buffer, NULL, &status ) ) {
+				read_error( status );
+				return( -1 );
+			}
+
+			p = band_buffer;
+			q = line_buffer + z * es;
+			for( x = 0; x < im->Xsize; x++ ) {
+				for( k = 0; k < es; k++ )
+					q[k] = p[k];
+
+				p += es;
+				q += im->Bands * es;
+			}
+		}
+
+		if( im_writeline( y, im, line_buffer ) )
+			return( -1 );
+	}
+
+	return( 0 );
+}
+
+/* Read the image in chunks on demand.
+ */
+static int
+fits2vips_get_data_lazy( Read *read )
+{
+	return( im_generate( read->out, 
+		NULL, fits2vips_generate, NULL, read, NULL ) );
+}
+
 /**
  * im_fits2vips:
  * @filename: file to load
@@ -371,7 +467,7 @@ im_fits2vips( const char *filename, IMAGE *out )
 	if( !(read = read_new( filename, out )) ) 
 		return( -1 );
 	if( fits2vips_get_header( read ) ||
-		fits2vips_get_data( read ) ) {
+		fits2vips_get_data_scanlinewise( read ) ) {
 		read_destroy( read );
 		return( -1 );
 	}
diff --git a/libvips/resample/interpolate.c b/libvips/resample/interpolate.c
index 95bcf90d..849cbc10 100644
--- a/libvips/resample/interpolate.c
+++ b/libvips/resample/interpolate.c
@@ -7,6 +7,9 @@
  * 	  defaults to (window_size / 2 - 1), so for a 4x4 stencil (eg.
  * 	  bicubic) we have an offset of 1
  * 	- tiny speedups
+ * 7/1/11
+ * 	- don't use tables for bilinear on float data for a small speedup
+ * 	  (thanks Nicolas)
  */
 
 /*
@@ -323,15 +326,12 @@ typedef VipsInterpolateClass VipsInterpolateBilinearClass;
 G_DEFINE_TYPE( VipsInterpolateBilinear, vips_interpolate_bilinear,
 	VIPS_TYPE_INTERPOLATE );
 
-/* Precalculated interpolation matricies. int (used for pel sizes up
- * to short), and float (for all others). We go to scale + 1 so
- * we can round-to-nearest safely. Don't bother with double, since
- * this is an approximation anyway.
+/* Precalculated interpolation matricies, only for int types.
+ * We go to scale + 1 so
+ * we can round-to-nearest safely. 
  */
 static int vips_bilinear_matrixi
 	[VIPS_TRANSFORM_SCALE + 1][VIPS_TRANSFORM_SCALE + 1][4];
-static float vips_bilinear_matrixd
-	[VIPS_TRANSFORM_SCALE + 1][VIPS_TRANSFORM_SCALE + 1][4];
 
 /* in this class, name vars in the 2x2 grid as eg.
  * p1  p2
@@ -344,6 +344,12 @@ static float vips_bilinear_matrixd
 #define BILINEAR_INT( TYPE ) { \
 	TYPE *tq = (TYPE *) out; \
  	\
+	const int six = sx & (VIPS_TRANSFORM_SCALE * 2 - 1); \
+	const int siy = sy & (VIPS_TRANSFORM_SCALE * 2 - 1); \
+	\
+	const int tx = (six + 1) >> 1; \
+	const int ty = (siy + 1) >> 1; \
+	\
 	const int c1 = vips_bilinear_matrixi[tx][ty][0]; \
 	const int c2 = vips_bilinear_matrixi[tx][ty][1]; \
 	const int c3 = vips_bilinear_matrixi[tx][ty][2]; \
@@ -364,12 +370,18 @@ static float vips_bilinear_matrixd
  */
 #define BILINEAR_FLOAT( TYPE ) { \
 	TYPE *tq = (TYPE *) out; \
- 	\
-	const double c1 = vips_bilinear_matrixd[tx][ty][0]; \
-	const double c2 = vips_bilinear_matrixd[tx][ty][1]; \
-	const double c3 = vips_bilinear_matrixd[tx][ty][2]; \
-	const double c4 = vips_bilinear_matrixd[tx][ty][3]; \
 	\
+	float X = x - ix; \
+	float Y = y - iy; \
+	\
+	float Xd = 1.0 - X; \
+	float Yd = 1.0 - Y; \
+	\
+	float c1 = Xd * Yd; \
+	float c2 = X * Yd; \
+	float c3 = Xd * Y; \
+	float c4 = X * Y; \
+ 	\
 	const TYPE *tp1 = (TYPE *) p1; \
 	const TYPE *tp2 = (TYPE *) p2; \
 	const TYPE *tp3 = (TYPE *) p3; \
@@ -408,6 +420,10 @@ vips_interpolate_bilinear_interpolate( VipsInterpolate *interpolate,
 	const int ls = IM_REGION_LSKIP( in );
 	const int b = in->im->Bands;
 
+	/* We want ((int)x), but the tables versions needs to find a mask
+	 * index quickly from the residual. Calculate both.
+	 */
+
 	/* Find the mask index. We round-to-nearest, so we need to generate 
 	 * indexes in 0 to VIPS_TRANSFORM_SCALE, 2^n + 1 values. We multiply 
 	 * by 2 more than we need to, add one, mask, then shift down again to 
@@ -416,13 +432,7 @@ vips_interpolate_bilinear_interpolate( VipsInterpolate *interpolate,
 	const int sx = x * VIPS_TRANSFORM_SCALE * 2;
 	const int sy = y * VIPS_TRANSFORM_SCALE * 2;
 
-	const int six = sx & (VIPS_TRANSFORM_SCALE * 2 - 1);
-	const int siy = sy & (VIPS_TRANSFORM_SCALE * 2 - 1);
-
-	const int tx = (six + 1) >> 1;
-	const int ty = (siy + 1) >> 1;
-
-	/* We want ((int)x) ... void repeating this double -> int conversion 
+	/* We want ((int)x) ... avoid repeating this double -> int conversion 
 	 * by just shifting sx down.
 	 */
 	const int ix = sx >> (VIPS_TRANSFORM_SHIFT + 1);
@@ -474,11 +484,6 @@ vips_interpolate_bilinear_class_init( VipsInterpolateBilinearClass *class )
 			c3 = Xd * Y;
 			c4 = X * Y;
 
-			vips_bilinear_matrixd[x][y][0] = c1;
-			vips_bilinear_matrixd[x][y][1] = c2;
-			vips_bilinear_matrixd[x][y][2] = c3;
-			vips_bilinear_matrixd[x][y][3] = c4;
-
 			vips_bilinear_matrixi[x][y][0] = 
 				c1 * VIPS_INTERPOLATE_SCALE;
 			vips_bilinear_matrixi[x][y][1] =