/* convi
 *
 * Copyright: 1990, N. Dessipris.
 *
 * Author: Nicos Dessipris & Kirk Martinez
 * Written on: 29/04/1991
 * Modified on: 19/05/1991
 * 8/7/93 JC
 *      - adapted for partial v2
 *      - memory leaks fixed
 *      - ANSIfied
 * 23/7/93 JC
 *	- inner loop unrolled with a switch - 25% speed-up!
 * 13/12/93 JC
 *	- tiny rounding error removed
 * 7/10/94 JC
 *	- new IM_ARRAY() macro
 *	- various simplifications
 *	- evalend callback added
 * 1/2/95 JC
 *	- use of IM_REGION_ADDR() updated
 *	- output size was incorrect! see comment below
 *	- bug with large non-square matricies fixed too
 *	- uses new im_embed() function
 * 13/7/98 JC
 *	- wierd bug ... im_free_imask is no longer directly called for close
 *	  callback, caused SIGKILL on solaris 2.6 ... linker bug?
 * 9/3/01 JC
 *	- reworked and simplified, about 10% faster
 *	- slightly better range clipping
 * 27/7/01 JC
 *	- reject masks with scale == 0
 * 7/4/04 
 *	- im_conv() now uses im_embed() with edge stretching on the input, not
 *	  the output
 *	- sets Xoffset / Yoffset
 * 11/11/05
 * 	- simpler inner loop avoids gcc4 bug 
 * 7/11/07
 * 	- new evalstart/end callbacks
 * 12/5/08
 * 	- int rounding was +1 too much, argh
 * 	- only rebuild the buffer offsets if bpl changes
 * 5/4/09
 * 	- tiny speedups and cleanups
 * 	- add restrict, though it doesn't seem to help gcc
 * 12/11/09
 * 	- only check for non-zero elements once
 * 	- add mask-all-zero check
 * 	- cleanups
 * 3/2/10
 * 	- gtkdoc
 * 	- more cleanups
 * 23/08/10
 * 	- add a special case for 3x3 masks, about 20% faster
 * 1/10/10
 * 	- support complex (just double the bands)
 * 18/10/10
 * 	- add experimental Orc path
 * 29/10/10
 * 	- use VipsVector
 * 	- get rid of im_convsep(), just call this twice, no longer worth
 * 	  keeping two versions
 * 8/11/10
 * 	- add array tiling
 * 9/5/11
 * 	- argh typo in overflow estimation could cause errors
 * 15/10/11 Nicolas
 * 	- handle offset correctly in seperable convolutions
 * 26/1/16 Lovell Fuller
 * 	- remove Duff for a 25% speedup
 * 23/6/16
 * 	- rewritten as a class
 * 	- new fixed-point vector path, up to 2x faster
 * 2/7/17
 * 	- remove pts for a small speedup
 * 12/10/17
 * 	- fix leak of vectors, thanks MHeimbuc 
 * 14/10/17
 * 	- switch to half-float for vector path
 */

/*

    This file is part of VIPS.
    
    VIPS is free software; you can redistribute it and/or modify
    it under the terms of the GNU Lesser General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
    02110-1301  USA

 */

/*

    These files are distributed with VIPS - http://www.vips.ecs.soton.ac.uk

 */

/* 
#define DEBUG
#define DEBUG_PIXELS
#define DEBUG_COMPILE
 */

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif /*HAVE_CONFIG_H*/
#include <vips/intl.h>

#include <stdio.h>
#include <stdlib.h>
#include <limits.h>

#include <vips/vips.h>
#include <vips/internal.h>

#include "pconvolution.h"

/* Larger than this and we fall back to C.
 */
#define MAX_PASS (20)

/* A pass with a vector. 
 */
typedef struct {
	int first;		/* The index of the first mask coff we use */
	int last;		/* The index of the last mask coff we use */

	int r;			/* Set previous result in this var */

        /* The code we generate for this section of the mask. 
	 */
        VipsVector *vector;
} Pass;

typedef struct {
	VipsConvolution parent_instance;

	int n_point;		/* w * h for our matrix */

	/* We make a smaller version of the mask with the zeros squeezed out.
	 */
	int nnz;		/* Number of non-zero mask elements */
	int *coeff;		/* Array of non-zero mask coefficients */
	int *coeff_pos;		/* Index of each nnz element in mask->coeff */

	/* And a half float version for the vector path. mant has the signed
	 * 8-bit mantissas in [-1, +1), sexp has the exponent shift after the
	 * mul and before the add, and exp has the final exponent shift before
	 * write-back.
	 */
	int *mant;
	int sexp;
	int exp;

	/* The set of passes we need for this mask.
	 */
	int n_pass;	
	Pass pass[MAX_PASS];

	/* Code for the final clip back to 8 bits.
	 */
	int r;			
        VipsVector *vector;
} VipsConvi;

typedef VipsConvolutionClass VipsConviClass;

G_DEFINE_TYPE( VipsConvi, vips_convi, VIPS_TYPE_CONVOLUTION );

/* Our sequence value.
 */
typedef struct {
	VipsConvi *convi;
	VipsRegion *ir;		/* Input region */

	int *offsets;		/* Offsets for each non-zero matrix element */

	int last_bpl;		/* Avoid recalcing offsets, if we can */

	/* We need a pair of intermediate buffers to keep the results of each
	 * vector conv pass. 
	 */
	short *t1;
	short *t2;
} VipsConviSequence;

static void
vips_convi_compile_free( VipsConvi *convi )
{
	int i;

	for( i = 0; i < convi->n_pass; i++ )
		VIPS_FREEF( vips_vector_free, convi->pass[i].vector );
	convi->n_pass = 0;
	VIPS_FREEF( vips_vector_free, convi->vector );
}

static void
vips_convi_dispose( GObject *gobject )
{
	VipsConvi *convi = (VipsConvi *) gobject;

#ifdef DEBUG
	printf( "vips_convi_dispose: " );
	vips_object_print_name( VIPS_OBJECT( gobject ) );
	printf( "\n" );
#endif /*DEBUG*/

	vips_convi_compile_free( convi ); 

	G_OBJECT_CLASS( vips_convi_parent_class )->dispose( gobject );
}

/* Free a sequence value.
 */
static int
vips_convi_stop( void *vseq, void *a, void *b )
{
	VipsConviSequence *seq = (VipsConviSequence *) vseq;

	VIPS_UNREF( seq->ir );
	VIPS_FREE( seq->offsets );
	VIPS_FREE( seq->t1 );
	VIPS_FREE( seq->t2 );

	return( 0 );
}

/* Convolution start function.
 */
static void *
vips_convi_start( VipsImage *out, void *a, void *b )
{
	VipsImage *in = (VipsImage *) a;
	VipsConvi *convi = (VipsConvi *) b;
	VipsConviSequence *seq;

	if( !(seq = VIPS_NEW( out, VipsConviSequence )) )
		return( NULL );

	seq->convi = convi;
	seq->ir = NULL;
	seq->offsets = NULL;
	seq->last_bpl = -1;
	seq->t1 = NULL;
	seq->t2 = NULL;

	seq->ir = vips_region_new( in );

	/* C mode.
	 */
	if( convi->nnz ) {
		if( !(seq->offsets = VIPS_ARRAY( NULL, convi->nnz, int )) ) { 
			vips_convi_stop( seq, in, convi );
			return( NULL );
		}
	}

	/* Vector mode.
	 */
	if( convi->n_pass ) {
		seq->t1 = VIPS_ARRAY( NULL, VIPS_IMAGE_N_ELEMENTS( in ), short );
		seq->t2 = VIPS_ARRAY( NULL, VIPS_IMAGE_N_ELEMENTS( in ), short );

		if( !seq->t1 || 
			!seq->t2 ) {
			vips_convi_stop( seq, in, convi );
			return( NULL );
		}
	}

	return( (void *) seq );
}

#define TEMP( N, S ) vips_vector_temporary( v, (char *) N, S )
#define PARAM( N, S ) vips_vector_parameter( v, (char *) N, S )
#define SCANLINE( N, P, S ) vips_vector_source_scanline( v, (char *) N, P, S )
#define CONST( N, V, S ) vips_vector_constant( v, (char *) N, V, S )
#define ASM2( OP, A, B ) vips_vector_asm2( v, (char *) OP, A, B )
#define ASM3( OP, A, B, C ) vips_vector_asm3( v, (char *) OP, A, B, C )

/* Generate code for a section of the mask. first is the index we start
 * at, we set last to the index of the last one we use before we run 
 * out of intermediates / constants / parameters / sources or mask
 * coefficients.
 *
 * 0 for success, -1 on error.
 */
static int
vips_convi_compile_section( VipsConvi *convi, VipsImage *in, Pass *pass )
{
	VipsConvolution *convolution = (VipsConvolution *) convi;
	VipsImage *M = convolution->M;

	VipsVector *v;
	int i;

#ifdef DEBUG_COMPILE
	printf( "starting pass %d\n", pass->first ); 
#endif /*DEBUG_COMPILE*/

	pass->vector = v = vips_vector_new( "convi", 2 );

	/* "r" is the array of sums from the previous pass (if any).
	 */
	pass->r = vips_vector_source_name( v, "r", 2 );

	/* The value we fetch from the image, the accumulated sum.
	 */
	TEMP( "value", 2 );
	TEMP( "valueb", 1 );
	TEMP( "sum", 2 );

	/* Init the sum. If this is the first pass, it's a constant. If this
	 * is a later pass, we have to init the sum from the result 
	 * of the previous pass. 
	 */
	if( pass->first == 0 ) {
		char c0[256];

		CONST( c0, 0, 2 );
		ASM2( "loadpw", "sum", c0 );
	}
	else 
		ASM2( "loadw", "sum", "r" );

	for( i = pass->first; i < convi->n_point; i++ ) {
		int x = i % M->Xsize;
		int y = i / M->Xsize;

		char source[256];
		char off[256];
		char rnd[256];
		char sexp[256];
		char coeff[256];

		/* Exclude zero elements.
		 */
		if( !convi->mant[i] )
			continue;

		/* The source. sl0 is the first scanline in the mask.
		 */
		SCANLINE( source, y, 1 );

		/* Load with an offset. Only for non-first-columns though.
		 */
		if( x == 0 ) 
			ASM2( "convubw", "value", source );
		else {
			CONST( off, in->Bands * x, 1 );
			ASM3( "loadoffb", "valueb", source, off );
			ASM2( "convubw", "value", "valueb" );
		}

		/* We need a signed multiply, so the image pixel needs to
		 * become a signed 16-bit value. We know only the bottom 8 bits
		 * of the image and coefficient are interesting, so we can take
		 * the bottom half of a 16x16->32 multiply. 
		 */
		CONST( coeff, convi->mant[i], 2 );
		ASM3( "mullw", "value", "value", coeff );

		/* Shift right before add to prevent overflow on large masks.
		 */
		CONST( sexp, convi->sexp, 2 );
		CONST( rnd, 1 << (convi->sexp - 1), 2 );
		ASM3( "addw", "value", "value", rnd );
		ASM3( "shrsw", "value", "value", sexp );

		/* We accumulate the signed 16-bit result in sum. Saturated
		 * add. 
		 */
		ASM3( "addssw", "sum", "sum", "value" );

		if( vips_vector_full( v ) )
			break;
	}

	pass->last = i;

	/* And write to our intermediate buffer.
	 */
	ASM2( "copyw", "d1", "sum" );

	if( !vips_vector_compile( v ) ) 
		return( -1 );

#ifdef DEBUG_COMPILE
	printf( "done coeffs %d to %d\n", pass->first, pass->last );
	vips_vector_print( v );
#endif /*DEBUG_COMPILE*/

	return( 0 );
}

/* Generate code for the final 16->8 conversion. 
 *
 * 0 for success, -1 on error.
 */
static int
vips_convi_compile_clip( VipsConvi *convi )
{
	VipsConvolution *convolution = (VipsConvolution *) convi;
	VipsImage *M = convolution->M;
	int offset = VIPS_RINT( vips_image_get_offset( M ) );

	VipsVector *v;
	char rnd[256];
	char exp[256];
	char c0[256];
	char c255[256];
	char off[256];

	convi->vector = v = vips_vector_new( "convi", 1 );

	/* "r" is the array of sums we clip down. 
	 */
	convi->r = vips_vector_source_name( v, "r", 2 );

	/* The value we fetch from the image.
	 */
	TEMP( "value", 2 );

	CONST( rnd, 1 << (convi->exp - 1), 2 );
	ASM3( "addw", "value", "r", rnd );
	CONST( exp, convi->exp, 2 );
	ASM3( "shrsw", "value", "value", exp );

	CONST( off, offset, 2 ); 
	ASM3( "addw", "value", "value", off );

	/* You'd think "convsuswb" (convert signed 16-bit to unsigned
	 * 8-bit with saturation) would be quicker, but it's a lot
	 * slower.
	 */
	CONST( c0, 0, 2 );
	ASM3( "maxsw", "value", c0, "value" ); 
	CONST( c255, 255, 2 );
	ASM3( "minsw", "value", c255, "value" ); 

	ASM2( "convwb", "d1", "value" );

	if( !vips_vector_compile( v ) ) 
		return( -1 );

	return( 0 );
}

static int
vips_convi_compile( VipsConvi *convi, VipsImage *in )
{
	int i;
	Pass *pass;

	/* Generate passes until we've used up the whole mask.
	 */
	for( i = 0;; ) {
		/* Allocate space for another pass.
		 */
		if( convi->n_pass == MAX_PASS ) 
			return( -1 );
		pass = &convi->pass[convi->n_pass];
		convi->n_pass += 1;

		pass->first = i;
		pass->r = -1;

		if( vips_convi_compile_section( convi, in, pass ) )
			return( -1 );
		i = pass->last + 1;

		if( i >= convi->n_point )
			break;
	}

	if( vips_convi_compile_clip( convi ) )
		return( -1 );

	return( 0 );
}

static int
vips_convi_gen_vector( VipsRegion *or, 
	void *vseq, void *a, void *b, gboolean *stop )
{
	VipsConviSequence *seq = (VipsConviSequence *) vseq;
	VipsConvi *convi = (VipsConvi *) b;
	VipsConvolution *convolution = (VipsConvolution *) convi;
	VipsImage *M = convolution->M;
	VipsImage *in = (VipsImage *) a;
	VipsRegion *ir = seq->ir;
	VipsRect *r = &or->valid;
	int ne = r->width * in->Bands;

	VipsRect s;
	int i, y;
	VipsExecutor executor[MAX_PASS];
	VipsExecutor clip;

#ifdef DEBUG_PIXELS
	printf( "vips_convi_gen_vector: generating %d x %d at %d x %d\n",
		r->width, r->height, r->left, r->top ); 
#endif /*DEBUG_PIXELS*/

	/* Prepare the section of the input image we need. A little larger
	 * than the section of the output image we are producing.
	 */
	s = *r;
	s.width += M->Xsize - 1;
	s.height += M->Ysize - 1;
	if( vips_region_prepare( ir, &s ) )
		return( -1 );

	for( i = 0; i < convi->n_pass; i++ ) 
		vips_executor_set_program( &executor[i], 
			convi->pass[i].vector, ne );
	vips_executor_set_program( &clip, convi->vector, ne );

	VIPS_GATE_START( "vips_convi_gen_vector: work" ); 

	for( y = 0; y < r->height; y ++ ) { 
		VipsPel *q = VIPS_REGION_ADDR( or, r->left, r->top + y );
	
#ifdef DEBUG_PIXELS
{
		int h, v;

		printf( "before convolve: x = %d, y = %d\n", 
			r->left, r->top + y );
		for( v = 0; v < M->Ysize; v++ ) {
			for( h = 0; h < M->Xsize; h++ )
				printf( "%3d ", *VIPS_REGION_ADDR( ir, 
					r->left + h, r->top + y + v ) );
			printf( "\n" );
		}
}
#endif /*DEBUG_PIXELS*/

		/* We run our n passes to generate this scanline.
		 */
		for( i = 0; i < convi->n_pass; i++ ) {
			Pass *pass = &convi->pass[i]; 

			vips_executor_set_scanline( &executor[i], 
				ir, r->left, r->top + y );
			vips_executor_set_array( &executor[i],
				pass->r, seq->t1 );
			vips_executor_set_destination( &executor[i], seq->t2 );
			vips_executor_run( &executor[i] );

			VIPS_SWAP( signed short *, seq->t1, seq->t2 );
		}

#ifdef DEBUG_PIXELS
		printf( "before clip: %d\n", ((signed short *) seq->t1)[0] );
#endif /*DEBUG_PIXELS*/

		vips_executor_set_array( &clip, convi->r, seq->t1 );
		vips_executor_set_destination( &clip, q ); 
		vips_executor_run( &clip );

#ifdef DEBUG_PIXELS
		printf( "after clip: %d\n", 
			*VIPS_REGION_ADDR( or, r->left, r->top + y ) );
#endif /*DEBUG_PIXELS*/
	}

	VIPS_GATE_STOP( "vips_convi_gen_vector: work" ); 

	VIPS_COUNT_PIXELS( or, "vips_convi_gen_vector" ); 

	return( 0 );
}

/* INT inner loops.
 */
#define CONV_INT( TYPE, CLIP ) { \
	TYPE * restrict p = (TYPE *) VIPS_REGION_ADDR( ir, le, y ); \
	TYPE * restrict q = (TYPE *) VIPS_REGION_ADDR( or, le, y ); \
	int * restrict offsets = seq->offsets; \
	\
	for( x = 0; x < sz; x++ ) {  \
		int sum; \
		int i; \
		\
		sum = 0; \
		for ( i = 0; i < nnz; i++ ) \
			sum += t[i] * p[offsets[i]]; \
		\
		sum = ((sum + rounding) / scale) + offset; \
		\
		CLIP; \
		\
		q[x] = sum;  \
		p += 1; \
	}  \
} 

/* FLOAT inner loops.
 */
#define CONV_FLOAT( TYPE ) { \
	TYPE * restrict p = (TYPE *) VIPS_REGION_ADDR( ir, le, y ); \
	TYPE * restrict q = (TYPE *) VIPS_REGION_ADDR( or, le, y ); \
	int * restrict offsets = seq->offsets; \
	\
	for( x = 0; x < sz; x++ ) {  \
		double sum; \
		int i; \
		\
		sum = 0; \
		for ( i = 0; i < nnz; i++ ) \
			sum += t[i] * p[offsets[i]]; \
 		\
		sum = (sum / scale) + offset; \
		\
		q[x] = sum;  \
		p += 1; \
	}  \
} 

/* Various integer range clips. Record over/under flows.
 */
#define CLIP_UCHAR( V ) \
G_STMT_START { \
	if( (V) < 0 ) \
		(V) = 0; \
	else if( (V) > UCHAR_MAX ) \
		(V) = UCHAR_MAX; \
} G_STMT_END

#define CLIP_CHAR( V ) \
G_STMT_START { \
	if( (V) < SCHAR_MIN ) \
		(V) = SCHAR_MIN; \
	else if( (V) > SCHAR_MAX ) \
		(V) = SCHAR_MAX; \
} G_STMT_END

#define CLIP_USHORT( V ) \
G_STMT_START { \
	if( (V) < 0 ) \
		(V) = 0; \
	else if( (V) > USHRT_MAX ) \
		(V) = USHRT_MAX; \
} G_STMT_END

#define CLIP_SHORT( V ) \
G_STMT_START { \
	if( (V) < SHRT_MIN ) \
		(V) = SHRT_MIN; \
	else if( (V) > SHRT_MAX ) \
		(V) = SHRT_MAX; \
} G_STMT_END

#define CLIP_NONE( V ) {}

/* Convolve!
 */
static int
vips_convi_gen( VipsRegion *or, 
	void *vseq, void *a, void *b, gboolean *stop )
{
	VipsConviSequence *seq = (VipsConviSequence *) vseq;
	VipsConvi *convi = (VipsConvi *) b;
	VipsConvolution *convolution = (VipsConvolution *) convi;
	VipsImage *M = convolution->M;
	int scale = VIPS_RINT( vips_image_get_scale( M ) ); 
	int rounding = scale / 2;
	int offset = VIPS_RINT( vips_image_get_offset( M ) ); 
	VipsImage *in = (VipsImage *) a;
	VipsRegion *ir = seq->ir;
	int * restrict t = convi->coeff; 
	const int nnz = convi->nnz;
	VipsRect *r = &or->valid;
	int le = r->left;
	int to = r->top;
	int bo = VIPS_RECT_BOTTOM( r );
	int sz = VIPS_REGION_N_ELEMENTS( or ) * 
		(vips_band_format_iscomplex( in->BandFmt ) ? 2 : 1);

	VipsRect s;
	int x, y, z, i;

	/* Prepare the section of the input image we need. A little larger
	 * than the section of the output image we are producing.
	 */
	s = *r;
	s.width += M->Xsize - 1;
	s.height += M->Ysize - 1;
	if( vips_region_prepare( ir, &s ) )
		return( -1 );

        /* Fill offset array. Only do this if the bpl has changed since the 
	 * previous vips_region_prepare().
	 */
	if( seq->last_bpl != VIPS_REGION_LSKIP( ir ) ) {
		seq->last_bpl = VIPS_REGION_LSKIP( ir );

		for( i = 0; i < nnz; i++ ) {
			z = convi->coeff_pos[i];
			x = z % M->Xsize;
			y = z / M->Xsize;

			seq->offsets[i] = 
				(VIPS_REGION_ADDR( ir, x + le, y + to ) -
				 VIPS_REGION_ADDR( ir, le, to )) / 
					VIPS_IMAGE_SIZEOF_ELEMENT( ir->im ); 
		}
	}

	VIPS_GATE_START( "vips_convi_gen: work" ); 

	for( y = to; y < bo; y++ ) { 
		switch( in->BandFmt ) {
		case VIPS_FORMAT_UCHAR: 	
			CONV_INT( unsigned char, CLIP_UCHAR( sum ) ); 
			break;

		case VIPS_FORMAT_CHAR:   
			CONV_INT( signed char, CLIP_CHAR( sum ) ); 
			break;

		case VIPS_FORMAT_USHORT: 
			CONV_INT( unsigned short, CLIP_USHORT( sum ) ); 
			break;

		case VIPS_FORMAT_SHORT:  
			CONV_INT( signed short, CLIP_SHORT( sum ) ); 
			break;

		case VIPS_FORMAT_UINT:   
			CONV_INT( unsigned int, CLIP_NONE( sum ) ); 
			break;

		case VIPS_FORMAT_INT:    
			CONV_INT( signed int, CLIP_NONE( sum ) ); 
			break;

		case VIPS_FORMAT_FLOAT:  
		case VIPS_FORMAT_COMPLEX:  
			CONV_FLOAT( float ); 
			break;

		case VIPS_FORMAT_DOUBLE: 
		case VIPS_FORMAT_DPCOMPLEX:  
			CONV_FLOAT( double ); 
			break;

		default:
			g_assert_not_reached();
		}
	}

	VIPS_GATE_STOP( "vips_convi_gen: work" ); 

	VIPS_COUNT_PIXELS( or, "vips_convi_gen" ); 

	return( 0 );
}

/* Make an int version of a mask.
 *
 * We rint() everything, then adjust the scale try to match the overall
 * effect.
 */
int
vips__image_intize( VipsImage *in, VipsImage **out )
{
	VipsImage *t;
	int x, y;
	double double_result;
	double out_scale;
	double out_offset;
	int int_result;

	if( vips_check_matrix( "vips2imask", in, &t ) )
		return( -1 ); 
	if( !(*out = vips_image_new_matrix( t->Xsize, t->Ysize )) ) {
		g_object_unref( t ); 
		return( -1 ); 
	}

	/* We want to make an intmask which has the same input to output ratio
	 * as the double image.
	 *
	 * Imagine convolving with the double image, what's the ratio of
	 * brightness between input and output? We want the same ratio for the
	 * int version, if we can.
	 *
	 * Imagine an input image where every pixel is 1, what will the output
	 * be?
	 */
	double_result = 0;
	for( y = 0; y < t->Ysize; y++ )
		for( x = 0; x < t->Xsize; x++ )
			double_result += *VIPS_MATRIX( t, x, y ); 
	double_result /= vips_image_get_scale( t );

	for( y = 0; y < t->Ysize; y++ )
		for( x = 0; x < t->Xsize; x++ )
			*VIPS_MATRIX( *out, x, y ) = 
				VIPS_RINT( *VIPS_MATRIX( t, x, y ) );

	out_scale = VIPS_RINT( vips_image_get_scale( t ) );
	if( out_scale == 0 )
		out_scale = 1;
	out_offset = VIPS_RINT( vips_image_get_offset( t ) );

	/* Now convolve a 1 everywhere image with the int version we've made,
	 * what do we get?
	 */
	int_result = 0;
	for( y = 0; y < t->Ysize; y++ )
		for( x = 0; x < t->Xsize; x++ )
			int_result += *VIPS_MATRIX( *out, x, y ); 
	int_result /= out_scale;

	/* And adjust the scale to get as close to a match as we can. 
	 */
	out_scale = VIPS_RINT( out_scale + (int_result - double_result) );
	if( out_scale == 0 ) 
		out_scale = 1;

	vips_image_set_double( *out, "scale", out_scale );
	vips_image_set_double( *out, "offset", out_offset );

	g_object_unref( t ); 

	return( 0 );
}

/* Make an int version of a mask. Each element is 8.8 float, with the same
 * exponent for each element (so just 8 bits in @out).
 *
 * @out is a w x h int array.
 */
static int
vips_convi_intize( VipsConvi *convi, VipsImage *M )
{
	VipsImage *t;
	double scale;
	double *scaled;
	double mx;
	double mn;
	int shift;
	int i;

	if( vips_check_matrix( "vips2imask", M, &t ) )
		return( -1 ); 

	/* Bake the scale into the mask to make a double version.
	 */
	scale = vips_image_get_scale( t );
        if( !(scaled = VIPS_ARRAY( convi, convi->n_point, double )) ) {
		g_object_unref( t ); 
		return( -1 );
	}
	for( i = 0; i < convi->n_point; i++ ) 
		scaled[i] = VIPS_MATRIX( t, 0, 0 )[i] / scale;
	g_object_unref( t ); 

#ifdef DEBUG_COMPILE
{
	int x, y;

	printf( "vips_convi_intize: double version\n" ); 
	for( y = 0; y < t->Ysize; y++ ) {
		printf( "\t" ); 
		for( x = 0; x < t->Xsize; x++ ) 
			printf( "%g ", scaled[y * t->Xsize + x] ); 
		printf( "\n" ); 
	}
}
#endif /*DEBUG_COMPILE*/

	mx = scaled[0];
	mn = scaled[0];
	for( i = 1; i < convi->n_point; i++ ) {
		if( scaled[i] > mx )
			mx = scaled[i];
		if( scaled[i] < mn )
			mn = scaled[i];
	}

	/* The mask max rounded up to the next power of two gives the exponent
	 * all elements share. Values are eg. -3 for 1/8, 3 for 8.
	 *
	 * Add one so we round up stuff exactly on x.0. We multiply by 128
	 * later, so 1.0 (for example) would become 128, which is outside
	 * signed 8 bit. 
	 */
	shift = ceil( log2( mx ) + 1 );

	/* We need to sum n_points, so we have to shift right before adding a
	 * new value to make sure we have enough range. 
	 */
	convi->sexp = ceil( log2( convi->n_point ) );
	if( convi->sexp > 10 ) {
		g_info( "vips_convi_intize: mask too large" ); 
		return( -1 ); 
	}

	/* With that already done, the final shift must be ...
	 */
	convi->exp = 7 - shift - convi->sexp;

	if( !(convi->mant = VIPS_ARRAY( convi, convi->n_point, int )) )
		return( -1 );
	for( i = 0; i < convi->n_point; i++ ) {
		/* 128 since this is signed. 
		 */
		convi->mant[i] = VIPS_RINT( 128 * scaled[i] * pow(2, -shift) );

		if( convi->mant[i] < -128 ||
			convi->mant[i] > 127 ) {
			g_info( "vips_convi_intize: mask range too large" ); 
			return( -1 );
		}
	}

#ifdef DEBUG_COMPILE
{
	int x, y;

	printf( "vips_convi_intize:\n" ); 
	printf( "sexp = %d\n", convi->sexp ); 
	printf( "exp = %d\n", convi->exp ); 
	for( y = 0; y < t->Ysize; y++ ) {
		printf( "\t" ); 
		for( x = 0; x < t->Xsize; x++ ) 
			printf( "%4d ", convi->mant[y * t->Xsize + x] ); 
		printf( "\n" ); 
	}
}
#endif /*DEBUG_COMPILE*/

	/* Verify accuracy.
	 */
{
	double true_sum;
	int int_sum;
	int true_value;
	int int_value;

	true_sum = 0.0;
	int_sum = 0;
	for( i = 0; i < convi->n_point; i++ ) {
		int value;

		true_sum += 128 * scaled[i];
		value = 128 * convi->mant[i];
		value = (value + (1 << (convi->sexp - 1))) >> convi->sexp;
		int_sum += value;
		int_sum = VIPS_CLIP( SHRT_MIN, int_sum, SHRT_MAX ); 
	}

	true_value = VIPS_CLIP( 0, true_sum, 255 ); 

	if( convi->exp > 0 )
		int_value = (int_sum + (1 << (convi->exp - 1))) >> convi->exp;
	else
		int_value = VIPS_LSHIFT_INT( int_sum, convi->exp );
	int_value = VIPS_CLIP( 0, int_value, 255 ); 

	if( VIPS_ABS( true_value - int_value ) > 2 ) {
		g_info( "vips_convi_intize: too inaccurate" );
		return( -1 ); 
	}
}

	return( 0 );
}

static int
vips_convi_build( VipsObject *object )
{
	VipsConvolution *convolution = (VipsConvolution *) object;
	VipsConvi *convi = (VipsConvi *) object;
	VipsImage **t = (VipsImage **) vips_object_local_array( object, 4 );

	VipsImage *in;
	VipsImage *M;
	VipsGenerateFn generate;
	double *coeff;
        int i;

	if( VIPS_OBJECT_CLASS( vips_convi_parent_class )->build( object ) )
		return( -1 );

	in = convolution->in;
	M = convolution->M;
	convi->n_point = M->Xsize * M->Ysize;

	if( vips_embed( in, &t[0], 
		M->Xsize / 2, M->Ysize / 2, 
		in->Xsize + M->Xsize - 1, in->Ysize + M->Ysize - 1,
		"extend", VIPS_EXTEND_COPY,
		NULL ) )
		return( -1 );
	in = t[0]; 

	/* Default to the C path.
	 */
	generate = vips_convi_gen;

	/* For uchar input, try to make a vector path.
	 */
	if( vips_vector_isenabled() &&
		in->BandFmt == VIPS_FORMAT_UCHAR ) {
		if( !vips_convi_intize( convi, M ) &&
			!vips_convi_compile( convi, in ) ) {
			generate = vips_convi_gen_vector;
			g_info( "convi: using vector path" ); 
		}
		else
			vips_convi_compile_free( convi );
	}

	/* Make the data for the C path.
	 */
	if( generate == vips_convi_gen ) { 
		g_info( "convi: using C path" ); 

		/* Make an int version of our mask.
		 */
		if( vips__image_intize( M, &t[1] ) )
			return( -1 ); 
		M = t[1];

		coeff = VIPS_MATRIX( M, 0, 0 ); 
		if( !(convi->coeff = VIPS_ARRAY( object, convi->n_point, int )) ||
			!(convi->coeff_pos = 
				VIPS_ARRAY( object, convi->n_point, int )) )
			return( -1 );

		/* Squeeze out zero mask elements. 
		 */
		convi->nnz = 0;
		for( i = 0; i < convi->n_point; i++ )
			if( coeff[i] ) {
				convi->coeff[convi->nnz] = coeff[i];
				convi->coeff_pos[convi->nnz] = i;
				convi->nnz += 1;
			}

		/* Was the whole mask zero? We must have at least 1 element 
		 * in there: set it to zero.
		 */
		if( convi->nnz == 0 ) {
			convi->coeff[0] = 0;
			convi->coeff_pos[0] = 0;
			convi->nnz = 1;
		}
	}

	g_object_set( convi, "out", vips_image_new(), NULL ); 
	if( vips_image_pipelinev( convolution->out, 
		VIPS_DEMAND_STYLE_SMALLTILE, in, NULL ) )
		return( -1 );

	/* Prepare output. Consider a 7x7 mask and a 7x7 image --- the output
	 * would be 1x1.
	 */
	convolution->out->Xsize -= M->Xsize - 1;
	convolution->out->Ysize -= M->Ysize - 1;

	if( vips_image_generate( convolution->out, 
		vips_convi_start, generate, vips_convi_stop, in, convi ) )
		return( -1 );

	convolution->out->Xoffset = -M->Xsize / 2;
	convolution->out->Yoffset = -M->Ysize / 2;

	return( 0 );
}

static void
vips_convi_class_init( VipsConviClass *class )
{
	GObjectClass *gobject_class = G_OBJECT_CLASS( class );
	VipsObjectClass *object_class = (VipsObjectClass *) class;

	gobject_class->dispose = vips_convi_dispose;

	object_class->nickname = "convi";
	object_class->description = _( "int convolution operation" );
	object_class->build = vips_convi_build;
}

static void
vips_convi_init( VipsConvi *convi )
{
        convi->nnz = 0;
        convi->coeff = NULL;
        convi->coeff_pos = NULL;
}

/**
 * vips_convi: (method)
 * @in: input image
 * @out: (out): output image
 * @mask: convolve with this mask
 * @...: %NULL-terminated list of optional named arguments
 *
 * Integer convolution. This is a low-level operation, see vips_conv() for 
 * something more convenient. 
 *
 * @mask is converted to an integer mask with rint() of each element, rint of
 * scale and rint of offset. Each output pixel is then calculated as 
 *
 * |[
 * sigma[i]{pixel[i] * mask[i]} / scale + offset
 * ]|
 *
 * The output image always has the same #VipsBandFormat as the input image. 
 *
 * For #VIPS_FORMAT_UCHAR images, vips_convi() uses a fast vector path based on
 * half-float arithmetic. This can produce slightly different results. 
 * Disable the vector path with `--vips-novector` or `VIPS_NOVECTOR` or
 * vips_vector_set_enabled().
 *
 * See also: vips_conv().
 *
 * Returns: 0 on success, -1 on error
 */
int 
vips_convi( VipsImage *in, VipsImage **out, VipsImage *mask, ... )
{
	va_list ap;
	int result;

	va_start( ap, mask );
	result = vips_call_split( "convi", ap, in, out, mask );
	va_end( ap );

	return( result );
}