1107 lines
25 KiB
C
1107 lines
25 KiB
C
/* im_conv
|
|
*
|
|
* Copyright: 1990, N. Dessipris.
|
|
*
|
|
* Author: Nicos Dessipris & Kirk Martinez
|
|
* Written on: 29/04/1991
|
|
* Modified on: 19/05/1991
|
|
* 8/7/93 JC
|
|
* - adapted for partial v2
|
|
* - memory leaks fixed
|
|
* - ANSIfied
|
|
* 23/7/93 JC
|
|
* - inner loop unrolled with a switch - 25% speed-up!
|
|
* 13/12/93 JC
|
|
* - tiny rounding error removed
|
|
* 7/10/94 JC
|
|
* - new IM_ARRAY() macro
|
|
* - various simplifications
|
|
* - evalend callback added
|
|
* 1/2/95 JC
|
|
* - use of IM_REGION_ADDR() updated
|
|
* - output size was incorrect! see comment below
|
|
* - bug with large non-square matricies fixed too
|
|
* - uses new im_embed() function
|
|
* 13/7/98 JC
|
|
* - wierd bug ... im_free_imask is no longer directly called for close
|
|
* callback, caused SIGKILL on solaris 2.6 ... linker bug?
|
|
* 9/3/01 JC
|
|
* - reworked and simplified, about 10% faster
|
|
* - slightly better range clipping
|
|
* 27/7/01 JC
|
|
* - reject masks with scale == 0
|
|
* 7/4/04
|
|
* - im_conv() now uses im_embed() with edge stretching on the input, not
|
|
* the output
|
|
* - sets Xoffset / Yoffset
|
|
* 11/11/05
|
|
* - simpler inner loop avoids gcc4 bug
|
|
* 7/11/07
|
|
* - new evalstart/end callbacks
|
|
* 12/5/08
|
|
* - int rounding was +1 too much, argh
|
|
* - only rebuild the buffer offsets if bpl changes
|
|
* 5/4/09
|
|
* - tiny speedups and cleanups
|
|
* - add restrict, though it doesn't seem to help gcc
|
|
* 12/11/09
|
|
* - only check for non-zero elements once
|
|
* - add mask-all-zero check
|
|
* - cleanups
|
|
* 3/2/10
|
|
* - gtkdoc
|
|
* - more cleanups
|
|
* 23/08/10
|
|
* - add a special case for 3x3 masks, about 20% faster
|
|
* 1/10/10
|
|
* - support complex (just double the bands)
|
|
* 18/10/10
|
|
* - add experimental Orc path
|
|
* 29/10/10
|
|
* - use VipsVector
|
|
* - get rid of im_convsep(), just call this twice, no longer worth
|
|
* keeping two versions
|
|
* 8/11/10
|
|
* - add array tiling
|
|
* 9/5/11
|
|
* - argh typo in overflow estimation could cause errors
|
|
* 15/10/11 Nicolas
|
|
* - handle offset correctly in seperable convolutions
|
|
* 26/1/16 Lovell Fuller
|
|
* - remove Duff for a 25% speedup
|
|
*/
|
|
|
|
/*
|
|
|
|
This file is part of VIPS.
|
|
|
|
VIPS is free software; you can redistribute it and/or modify
|
|
it under the terms of the GNU Lesser General Public License as published by
|
|
the Free Software Foundation; either version 2 of the License, or
|
|
(at your option) any later version.
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU Lesser General Public License for more details.
|
|
|
|
You should have received a copy of the GNU Lesser General Public License
|
|
along with this program; if not, write to the Free Software
|
|
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
|
|
02110-1301 USA
|
|
|
|
*/
|
|
|
|
/*
|
|
|
|
These files are distributed with VIPS - http://www.vips.ecs.soton.ac.uk
|
|
|
|
*/
|
|
|
|
/* Show sample pixels as they are transformed.
|
|
#define DEBUG_PIXELS
|
|
*/
|
|
|
|
/*
|
|
#define DEBUG
|
|
*/
|
|
|
|
/*
|
|
|
|
TODO
|
|
|
|
- tried 8-bit data with a 32-bit intermediate, but it was only
|
|
slightly faster than C
|
|
|
|
16-bit data would be even slower, no speed advantage
|
|
|
|
- make up a signed 8-bit code path?
|
|
|
|
- don't use divluw, it's insanely slow, instead scale coefficients so
|
|
that we can just do >>8 at the end
|
|
|
|
*/
|
|
|
|
#ifdef HAVE_CONFIG_H
|
|
#include <config.h>
|
|
#endif /*HAVE_CONFIG_H*/
|
|
#include <vips/intl.h>
|
|
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <limits.h>
|
|
|
|
#include <vips/vips.h>
|
|
#include <vips/vector.h>
|
|
|
|
/* We can't run more than this many passes. Larger than this and we
|
|
* fall back to C.
|
|
*/
|
|
#define MAX_PASS (10)
|
|
|
|
/* A pass with a vector. Large masks may need more instructions than one
 * orc program can hold, so the convolution is split into several passes,
 * each summing part of the mask; "r" chains the partial sum from one pass
 * into the next.
 */
typedef struct {
	int first;		/* The index of the first mask coff we use */
	int last;		/* The index of the last mask coff we use */

	int r;			/* Set previous result in this var */

	/* The code we generate for this section of this mask.
	 */
	VipsVector *vector;
} Pass;
|
|
|
|
/* Our parameters ... we take a copy of the mask argument, plus we make a
 * smaller version with the zeros squeezed out.
 */
typedef struct {
	IMAGE *in;
	IMAGE *out;
	INTMASK *mask;		/* Copy of mask arg */

	int nnz;		/* Number of non-zero mask elements */
	int *coeff;		/* Array of non-zero mask coefficients */
	int *coeff_pos;		/* Index of each nnz element in mask->coeff */

	/* Global underflow/overflow counts ... per-thread counts are folded
	 * in here by conv_stop().
	 */
	int underflow;
	int overflow;

	/* The convolver we generate for this mask. We have to split the
	 * convolve and clip into two phases.
	 */
	int n_pass;		/* Number of convolve passes, 0 == no vector path */
	Pass pass[MAX_PASS];
	int s1;			/* Input to clip */
	VipsVector *clip;
} Conv;
|
|
|
|
static void
|
|
conv_vector_free( Conv *conv )
|
|
{
|
|
int i;
|
|
|
|
for( i = 0; i < conv->n_pass; i++ )
|
|
IM_FREEF( vips_vector_free, conv->pass[i].vector );
|
|
conv->n_pass = 0;
|
|
|
|
IM_FREEF( vips_vector_free, conv->clip );
|
|
}
|
|
|
|
static int
|
|
conv_close( Conv *conv )
|
|
{
|
|
IM_FREEF( im_free_imask, conv->mask );
|
|
conv_vector_free( conv );
|
|
|
|
return( 0 );
|
|
}
|
|
|
|
/* Start-of-eval callback. Intended to zero the clip counters, but the
 * reset is deliberately disabled -- see the comment below.
 */
static int
conv_evalstart( Conv *conv )
{
	/* Reset underflow/overflow count.
	 *
	 * This often doesn't get called until eval has already finished, so
	 * resetting here just wipes all records.
	 *
	conv->overflow = 0;
	conv->underflow = 0;
	 *
	 */

	return( 0 );
}
|
|
|
|
/* End-of-eval callback: report how many pixel values had to be clipped
 * while convolving, if any.
 */
static int
conv_evalend( Conv *conv )
{
	if( conv->overflow )
		vips_info( "im_conv",
			_( "%d overflows detected" ), conv->overflow );
	if( conv->underflow )
		vips_info( "im_conv",
			_( "%d underflows detected" ), conv->underflow );

	return( 0 );
}
|
|
|
|
/* Shorthand for building orc programs: declare a temporary, a per-scanline
 * source, a constant, and emit two/three-operand instructions. All assume
 * a VipsVector in scope named "v".
 */
#define TEMP( N, S ) vips_vector_temporary( v, N, S )
#define SCANLINE( N, P, S ) vips_vector_source_scanline( v, N, P, S )
#define CONST( N, V, S ) vips_vector_constant( v, N, V, S )
#define ASM2( OP, A, B ) vips_vector_asm2( v, OP, A, B )
#define ASM3( OP, A, B, C ) vips_vector_asm3( v, OP, A, B, C )
|
|
|
|
/* Generate code for a section of the mask, starting at element
 * pass->first and continuing until either the mask ends or the vector
 * program fills up. pass->last is set to the last element compiled in.
 *
 * 0 for success, -1 on error.
 */
static int
conv_compile_convolution_u8s16_section( Pass *pass,
	Conv *conv, gboolean first_pass )
{
	INTMASK *mask = conv->mask;
	const int n_mask = mask->xsize * mask->ysize;

	int i;
	VipsVector *v;
	char zero[256];
	char offset[256];
	char source[256];
	char coeff[256];

	pass->vector = v = vips_vector_new( "conv", 2 );

	/* The value we fetch from the image, the product with the matrix
	 * value, the accumulated sum.
	 */
	TEMP( "value", 1 );
	TEMP( "product", 2 );
	TEMP( "sum", 2 );

	/* Init the sum. If this is the first pass, it's a constant. If this
	 * is a later pass, we have to init the sum from the result
	 * of the previous pass.
	 */
	if( first_pass ) {
		CONST( zero, 0, 2 );
		ASM2( "copyw", "sum", zero );
	}
	else {
		/* "r" is the result of the previous pass.
		 */
		pass->r = vips_vector_source_name( v, "r", 2 );
		ASM2( "loadw", "sum", "r" );
	}

	for( i = pass->first; i < n_mask; i++ ) {
		int x = i % mask->xsize;
		int y = i / mask->xsize;

		if( !mask->coeff[i] )
			/* Exclude zero elements.
			 */
			continue;

		/* The source. sl0 is the first scanline in the mask.
		 */
		SCANLINE( source, y, 1 );

		/* The offset, only for non-first-columns though.
		 */
		if( x > 0 )
			CONST( offset, conv->in->Bands * x, 1 );

		/* The coefficient. Only for non-1 coeffs though, we skip the
		 * mul for them.
		 *
		 * We need to do 8-bit unsigned pixel * signed mask, so we
		 * have to cast the pixel up to 16-bit then do a mult against a
		 * 16-bit constant. We know the result will fit in the bottom
		 * 16 bits.
		 */
		if( mask->coeff[i] != 1 )
			CONST( coeff, mask->coeff[i], 2 );

		/* Two factors:
		 * - element is in the first column, ie. has a zero offset
		 * - mask coeff is 1, ie. we can skip the multiply
		 *
		 * We could combine some of these cases, but it's simpler
		 * and safer to spell them all out.
		 */
		if( x == 0 )
			ASM2( "loadb", "value", source );
		else
			ASM3( "loadoffb", "value", source, offset );

		/* Widen u8 to s16, then multiply and accumulate with
		 * signed saturation.
		 */
		ASM2( "convubw", "product", "value" );

		if( mask->coeff[i] != 1 )
			ASM3( "mullw", "product", "product", coeff );

		ASM3( "addssw", "sum", "sum", "product" );

		/* Stop before the program overflows: the rest of the mask
		 * goes into the next pass.
		 */
		if( vips_vector_full( v ) )
			break;
	}

	pass->last = i;

	/* Write the partial sum to the destination; it becomes "r" for the
	 * next pass, or the input to the clip stage.
	 */
	ASM2( "copyw", "d1", "sum" );

#ifdef DEBUG
	vips_vector_print( v );
	printf( "compiling ...\n" );
#endif /*DEBUG*/

	if( !vips_vector_compile( v ) )
		return( -1 );

	return( 0 );
}
|
|
|
|
/* Generate the convolution pass for u8 data with an s16 accumulator.
 * Splits the mask into as many sections as needed, each compiled by
 * conv_compile_convolution_u8s16_section().
 *
 * 0 for success, -1 on error (wrong format, possible s16 overflow, or
 * too many passes) -- the caller then falls back to the C path.
 */
static int
conv_compile_convolution_u8s16( Conv *conv )
{
	INTMASK *mask = conv->mask;
	const int n_mask = mask->xsize * mask->ysize;

	double min, max;
	int i;

	/* This vector path only handles unsigned 8-bit input.
	 */
	if( conv->in->BandFmt != IM_BANDFMT_UCHAR )
		return( -1 );

	/* Can the accumulator overflow or underflow at any stage? Since
	 * matrix elements are signed, we need to calculate a running
	 * possible min and max.
	 */
	min = 0;
	max = 0;
	for( i = 0; i < n_mask; i++ ) {
		/* 255 is the worst-case (largest) u8 pixel value.
		 */
		int v = 255 * mask->coeff[i];

		min = IM_MIN( min, min + v );
		max = IM_MAX( max, max + v );

		if( max > SHRT_MAX )
			return( -1 );
		if( min < SHRT_MIN )
			return( -1 );
	}

	/* Generate passes until we've used up the whole mask.
	 */
	for( i = 0;;) {
		Pass *pass;

		/* Skip any zero coefficients at the start of the mask
		 * region.
		 */
		for( ; i < n_mask && !mask->coeff[i]; i++ )
			;
		if( i == n_mask )
			break;

		/* Allocate space for another pass.
		 */
		if( conv->n_pass == MAX_PASS )
			return( -1 );
		pass = &conv->pass[conv->n_pass];
		conv->n_pass += 1;

		pass->first = i;
		pass->last = i;
		pass->r = -1;

		/* first_pass is TRUE only for the very first section: later
		 * sections load the running sum from the previous pass.
		 */
		if( conv_compile_convolution_u8s16_section( pass,
			conv, conv->n_pass == 1 ) )
			return( -1 );
		i = pass->last + 1;

#ifdef DEBUG
		printf( "conv_compile_convolution_u8s16: "
			"first = %d, last = %d\n",
			pass->first, pass->last );
#endif /*DEBUG*/

		if( i >= n_mask )
			break;
	}

	return( 0 );
}
|
|
|
|
/* Generate the program that does (pass + rounding) / scale + offset
 * from a s16 intermediate back to a u8 output.
 *
 * 0 for success, -1 on error (scale/offset out of range for the
 * unsigned 16-bit divide).
 */
static int
conv_compile_scale_s16u8( Conv *conv )
{
	INTMASK *mask = conv->mask;

	VipsVector *v;
	char scale[256];
	char offset[256];
	char zero[256];

	/* Scale and offset must be in range.
	 */
	if( mask->scale > 255 ||
		mask->scale < 0 ||
		mask->offset > SHRT_MAX ||
		mask->offset < SHRT_MIN )
		return( -1 );

	conv->clip = v = vips_vector_new( "clip", 1 );
	conv->s1 = vips_vector_source_name( v, "s1", 2 );

	TEMP( "t1", 2 );
	TEMP( "t2", 2 );

	/* We can only do unsigned divide, so we must add the offset before
	 * dividing by the scale. We need to scale the offset up.
	 *
	 * We can build the rounding into the offset as well.
	 * You might think this should be (scale + 1) / 2, but then we'd be
	 * adding one for scale == 1.
	 */
	CONST( scale, mask->scale, 1 );
	CONST( offset, mask->offset * mask->scale + mask->scale / 2, 2 );
	CONST( zero, 0, 2 );

	/* Offset and scale.
	 */
	ASM3( "addssw", "t1", "s1", offset );

	/* We need to convert the signed result of the
	 * offset to unsigned for the div, ie. we want to set anything <0 to 0.
	 * cmpgtsw makes an all-ones / all-zeros mask; andw applies it.
	 */
	ASM3( "cmpgtsw", "t2", "t1", zero );
	ASM3( "andw", "t1", "t1", "t2" );

	/* Unsigned divide, then saturated narrow down to u8 in the
	 * destination.
	 */
	ASM3( "divluw", "t1", "t1", scale );
	ASM2( "convuuswb", "d1", "t1" );

	if( !vips_vector_compile( v ) )
		return( -1 );

#ifdef DEBUG
	vips_vector_print( v );
#endif /*DEBUG*/

	return( 0 );
}
|
|
|
|
static Conv *
|
|
conv_new( IMAGE *in, IMAGE *out, INTMASK *mask )
|
|
{
|
|
Conv *conv = IM_NEW( out, Conv );
|
|
const int n_mask = mask->xsize * mask->ysize;
|
|
int i;
|
|
|
|
if( !conv )
|
|
return( NULL );
|
|
|
|
conv->in = in;
|
|
conv->out = out;
|
|
conv->mask = NULL;
|
|
conv->nnz = 0;
|
|
conv->coeff = NULL;
|
|
conv->coeff_pos = NULL;
|
|
conv->underflow = 0;
|
|
conv->overflow = 0;
|
|
|
|
conv->n_pass = 0;
|
|
conv->s1 = -1;
|
|
conv->clip = NULL;
|
|
|
|
if( im_add_close_callback( out,
|
|
(im_callback_fn) conv_close, conv, NULL ) ||
|
|
im_add_close_callback( out,
|
|
(im_callback_fn) conv_evalstart, conv, NULL ) ||
|
|
im_add_close_callback( out,
|
|
(im_callback_fn) conv_evalend, conv, NULL ) ||
|
|
!(conv->coeff = IM_ARRAY( out, n_mask, int )) ||
|
|
!(conv->coeff_pos = IM_ARRAY( out, n_mask, int )) ||
|
|
!(conv->mask = im_dup_imask( mask, "conv_mask" )) )
|
|
return( NULL );
|
|
|
|
/* Find non-zero mask elements.
|
|
*/
|
|
for( i = 0; i < n_mask; i++ )
|
|
if( mask->coeff[i] ) {
|
|
conv->coeff[conv->nnz] = mask->coeff[i];
|
|
conv->coeff_pos[conv->nnz] = i;
|
|
conv->nnz += 1;
|
|
}
|
|
|
|
/* Was the whole mask zero? We must have at least 1 element in there:
|
|
* set it to zero.
|
|
*/
|
|
if( conv->nnz == 0 ) {
|
|
conv->coeff[0] = mask->coeff[0];
|
|
conv->coeff_pos[0] = 0;
|
|
conv->nnz = 1;
|
|
}
|
|
|
|
/* Generate code for this mask / image, if possible.
|
|
*/
|
|
if( vips_vector_isenabled() ) {
|
|
if( conv_compile_convolution_u8s16( conv ) ||
|
|
conv_compile_scale_s16u8( conv ) )
|
|
conv_vector_free( conv );
|
|
}
|
|
|
|
return( conv );
|
|
}
|
|
|
|
/* Our sequence value: per-thread state for one generate pass.
 */
typedef struct {
	Conv *conv;
	REGION *ir;		/* Input region */

	int *offsets;		/* Offsets for each non-zero matrix element */
	VipsPel **pts;		/* Per-non-zero mask element pointers */

	/* Underflow/overflow counts for this thread; folded into the
	 * global counts in conv_stop().
	 */
	int underflow;
	int overflow;

	int last_bpl;		/* Avoid recalcing offsets, if we can */

	/* We need a pair of intermediate buffers to keep the results of each
	 * conv pass in. Only allocated on the vector path; malloced, so
	 * they must be freed in conv_stop().
	 */
	void *t1;
	void *t2;
} ConvSequence;
|
|
|
|
/* Free a sequence value.
|
|
*/
|
|
static int
|
|
conv_stop( void *vseq, void *a, void *b )
|
|
{
|
|
ConvSequence *seq = (ConvSequence *) vseq;
|
|
Conv *conv = (Conv *) b;
|
|
|
|
/* Add local under/over counts to global counts.
|
|
*/
|
|
conv->overflow += seq->overflow;
|
|
conv->underflow += seq->underflow;
|
|
|
|
IM_FREEF( im_region_free, seq->ir );
|
|
IM_FREE( seq->t1 );
|
|
IM_FREE( seq->t2 );
|
|
|
|
return( 0 );
|
|
}
|
|
|
|
/* Convolution start function: build the per-thread ConvSequence.
 * Returns NULL on error; partially-built state is torn down with
 * conv_stop().
 */
static void *
conv_start( IMAGE *out, void *a, void *b )
{
	IMAGE *in = (IMAGE *) a;
	Conv *conv = (Conv *) b;

	ConvSequence *seq;

	if( !(seq = IM_NEW( out, ConvSequence )) )
		return( NULL );

	/* Init!
	 */
	seq->conv = conv;
	seq->ir = NULL;
	seq->pts = NULL;
	seq->underflow = 0;
	seq->overflow = 0;
	seq->last_bpl = -1;
	seq->t1 = NULL;
	seq->t2 = NULL;

	/* Attach region and arrays.
	 */
	seq->ir = im_region_create( in );
	seq->offsets = IM_ARRAY( out, conv->nnz, int );
	seq->pts = IM_ARRAY( out, conv->nnz, VipsPel * );
	if( !seq->ir || !seq->offsets || !seq->pts ) {
		conv_stop( seq, in, conv );
		return( NULL );
	}

	/* Intermediate line buffers, one short per image element: only
	 * needed by the vector path. Malloced (first arg NULL), so
	 * conv_stop() frees them explicitly.
	 */
	if( vips_vector_isenabled() &&
		conv->n_pass ) {
		seq->t1 = IM_ARRAY( NULL, IM_IMAGE_N_ELEMENTS( in ), short );
		seq->t2 = IM_ARRAY( NULL, IM_IMAGE_N_ELEMENTS( in ), short );

		if( !seq->t1 || !seq->t2 ) {
			conv_stop( seq, in, conv );
			return( NULL );
		}
	}

	return( seq );
}
|
|
|
|
/* INT inner loops. t[] holds the nnz non-zero coefficients, p[] one
 * pointer per coefficient into the input window; each output element is
 * (sum + rounding) / scale + offset, clipped by IM_CLIP. Expects seq, or,
 * le, y, sz, nnz, t, rounding and mask in scope at the expansion site.
 */
#define CONV_INT( TYPE, IM_CLIP ) { \
	TYPE ** restrict p = (TYPE **) seq->pts; \
	TYPE * restrict q = (TYPE *) IM_REGION_ADDR( or, le, y ); \
	\
	for( x = 0; x < sz; x++ ) { \
		int sum; \
		int i; \
		\
		sum = 0; \
		for ( i = 0; i < nnz; i++ ) \
			sum += t[i] * p[i][x]; \
		\
		sum = ((sum + rounding) / mask->scale) + mask->offset; \
		\
		IM_CLIP; \
		\
		q[x] = sum; \
	} \
}
|
|
|
|
/* FLOAT inner loops. Same shape as CONV_INT, but with a double
 * accumulator, no integer rounding term and no clipping.
 */
#define CONV_FLOAT( TYPE ) { \
	TYPE ** restrict p = (TYPE **) seq->pts; \
	TYPE * restrict q = (TYPE *) IM_REGION_ADDR( or, le, y ); \
	\
	for( x = 0; x < sz; x++ ) { \
		double sum; \
		int i; \
		\
		sum = 0; \
		for ( i = 0; i < nnz; i++ ) \
			sum += t[i] * p[i][x]; \
		\
		sum = (sum / mask->scale) + mask->offset; \
		\
		q[x] = sum; \
	} \
}
|
|
|
|
/* Convolve! See below for the special-case 3x3 path. General C path:
 * walks the non-zero mask elements via precomputed byte offsets into the
 * input region.
 */
static int
conv_gen( REGION *or, void *vseq, void *a, void *b )
{
	ConvSequence *seq = (ConvSequence *) vseq;
	IMAGE *in = (IMAGE *) a;
	Conv *conv = (Conv *) b;
	REGION *ir = seq->ir;
	INTMASK *mask = conv->mask;
	int * restrict t = conv->coeff;
	const int nnz = conv->nnz;

	/* You might think this should be (scale + 1) / 2, but then we'd be
	 * adding one for scale == 1.
	 */
	int rounding = mask->scale / 2;

	Rect *r = &or->valid;
	Rect s;
	int le = r->left;
	int to = r->top;
	int bo = IM_RECT_BOTTOM( r );
	/* Complex formats are processed as twice as many real elements.
	 */
	int sz = IM_REGION_N_ELEMENTS( or ) * (im_iscomplex( in ) ? 2 : 1);

	int x, y, z, i;

	/* Prepare the section of the input image we need. A little larger
	 * than the section of the output image we are producing.
	 */
	s = *r;
	s.width += mask->xsize - 1;
	s.height += mask->ysize - 1;
	if( im_prepare( ir, &s ) )
		return( -1 );

	VIPS_GATE_START( "conv_gen: work" );

	/* Fill offset array. Only do this if the bpl has changed since the
	 * previous im_prepare().
	 */
	if( seq->last_bpl != IM_REGION_LSKIP( ir ) ) {
		seq->last_bpl = IM_REGION_LSKIP( ir );

		for( i = 0; i < nnz; i++ ) {
			z = conv->coeff_pos[i];
			x = z % conv->mask->xsize;
			y = z / conv->mask->xsize;

			/* Byte offset of this mask element relative to the
			 * top-left of the input window.
			 */
			seq->offsets[i] =
				IM_REGION_ADDR( ir, x + le, y + to ) -
				IM_REGION_ADDR( ir, le, to );
		}
	}

	for( y = to; y < bo; y++ ) {
		/* Init pts for this line of PELs.
		 */
		for( z = 0; z < nnz; z++ )
			seq->pts[z] = seq->offsets[z] +
				IM_REGION_ADDR( ir, le, y );

		switch( in->BandFmt ) {
		case IM_BANDFMT_UCHAR:
			CONV_INT( unsigned char, IM_CLIP_UCHAR( sum, seq ) );
			break;

		case IM_BANDFMT_CHAR:
			CONV_INT( signed char, IM_CLIP_CHAR( sum, seq ) );
			break;

		case IM_BANDFMT_USHORT:
			CONV_INT( unsigned short, IM_CLIP_USHORT( sum, seq ) );
			break;

		case IM_BANDFMT_SHORT:
			CONV_INT( signed short, IM_CLIP_SHORT( sum, seq ) );
			break;

		case IM_BANDFMT_UINT:
			CONV_INT( unsigned int, IM_CLIP_NONE( sum, seq ) );
			break;

		case IM_BANDFMT_INT:
			CONV_INT( signed int, IM_CLIP_NONE( sum, seq ) );
			break;

		case IM_BANDFMT_FLOAT:
		case IM_BANDFMT_COMPLEX:
			CONV_FLOAT( float );
			break;

		case IM_BANDFMT_DOUBLE:
		case IM_BANDFMT_DPCOMPLEX:
			CONV_FLOAT( double );
			break;

		default:
			g_assert_not_reached();
		}
	}

	VIPS_GATE_STOP( "conv_gen: work" );

	return( 0 );
}
|
|
|
|
/* INT inner loops for the fixed 3x3 case: three scanline pointers, fully
 * unrolled 9-term sum, then the usual (sum + rounding) / scale + offset
 * and IM_CLIP. Expects ir, or, le, y, sz, bands, m, rounding and mask in
 * scope at the expansion site.
 */
#define CONV3x3_INT( TYPE, IM_CLIP ) { \
	TYPE * restrict p0 = (TYPE *) IM_REGION_ADDR( ir, le, y ); \
	TYPE * restrict p1 = (TYPE *) IM_REGION_ADDR( ir, le, y + 1 ); \
	TYPE * restrict p2 = (TYPE *) IM_REGION_ADDR( ir, le, y + 2 ); \
	TYPE * restrict q = (TYPE *) IM_REGION_ADDR( or, le, y ); \
	\
	for( x = 0; x < sz; x++ ) { \
		int sum; \
		\
		sum = 0; \
		sum += m[0] * p0[0]; \
		sum += m[1] * p0[bands]; \
		sum += m[2] * p0[bands * 2]; \
		sum += m[3] * p1[0]; \
		sum += m[4] * p1[bands]; \
		sum += m[5] * p1[bands * 2]; \
		sum += m[6] * p2[0]; \
		sum += m[7] * p2[bands]; \
		sum += m[8] * p2[bands * 2]; \
		\
		p0 += 1; \
		p1 += 1; \
		p2 += 1; \
		\
		sum = ((sum + rounding) / mask->scale) + mask->offset; \
		\
		IM_CLIP; \
		\
		q[x] = sum; \
	} \
}
|
|
|
|
/* FLOAT inner loops for the fixed 3x3 case: same unrolled sum as
 * CONV3x3_INT, but a double accumulator, no rounding and no clipping.
 */
#define CONV3x3_FLOAT( TYPE ) { \
	TYPE * restrict p0 = (TYPE *) IM_REGION_ADDR( ir, le, y ); \
	TYPE * restrict p1 = (TYPE *) IM_REGION_ADDR( ir, le, y + 1 ); \
	TYPE * restrict p2 = (TYPE *) IM_REGION_ADDR( ir, le, y + 2 ); \
	TYPE * restrict q = (TYPE *) IM_REGION_ADDR( or, le, y ); \
	\
	for( x = 0; x < sz; x++ ) { \
		double sum; \
		\
		sum = 0; \
		sum += m[0] * p0[0]; \
		sum += m[1] * p0[bands]; \
		sum += m[2] * p0[bands * 2]; \
		sum += m[3] * p1[0]; \
		sum += m[4] * p1[bands]; \
		sum += m[5] * p1[bands * 2]; \
		sum += m[6] * p2[0]; \
		sum += m[7] * p2[bands]; \
		sum += m[8] * p2[bands * 2]; \
		\
		p0 += 1; \
		p1 += 1; \
		p2 += 1; \
		\
		sum = (sum / mask->scale) + mask->offset; \
		\
		q[x] = sum; \
	} \
}
|
|
|
|
/* 3x3 masks are very common, so we have a special path for them. This is
 * about 20% faster than the general convolver above. Note: uses all nine
 * mask->coeff values directly, so zeros are not squeezed out here.
 */
static int
conv3x3_gen( REGION *or, void *vseq, void *a, void *b )
{
	ConvSequence *seq = (ConvSequence *) vseq;
	IMAGE *in = (IMAGE *) a;
	Conv *conv = (Conv *) b;
	REGION *ir = seq->ir;
	INTMASK *mask = conv->mask;
	int * restrict m = mask->coeff;

	/* You might think this should be (scale + 1) / 2, but then we'd be
	 * adding one for scale == 1.
	 */
	int rounding = mask->scale / 2;

	Rect *r = &or->valid;
	int le = r->left;
	int to = r->top;
	int bo = IM_RECT_BOTTOM( r );
	/* Complex formats are processed as twice as many real elements.
	 */
	int sz = IM_REGION_N_ELEMENTS( or ) * (im_iscomplex( in ) ? 2 : 1);
	int bands = in->Bands;

	Rect s;
	int x, y;

	/* Prepare the section of the input image we need. A little larger
	 * than the section of the output image we are producing.
	 */
	s = *r;
	s.width += 2;
	s.height += 2;
	if( im_prepare( ir, &s ) )
		return( -1 );

	VIPS_GATE_START( "conv3x3_gen: work" );

	for( y = to; y < bo; y++ ) {
		switch( in->BandFmt ) {
		case IM_BANDFMT_UCHAR:
			CONV3x3_INT( unsigned char,
				IM_CLIP_UCHAR( sum, seq ) );
			break;

		case IM_BANDFMT_CHAR:
			CONV3x3_INT( signed char,
				IM_CLIP_CHAR( sum, seq ) );
			break;

		case IM_BANDFMT_USHORT:
			CONV3x3_INT( unsigned short,
				IM_CLIP_USHORT( sum, seq ) );
			break;

		case IM_BANDFMT_SHORT:
			CONV3x3_INT( signed short,
				IM_CLIP_SHORT( sum, seq ) );
			break;

		case IM_BANDFMT_UINT:
			CONV3x3_INT( unsigned int,
				IM_CLIP_NONE( sum, seq ) );
			break;

		case IM_BANDFMT_INT:
			CONV3x3_INT( signed int,
				IM_CLIP_NONE( sum, seq ) );
			break;

		case IM_BANDFMT_FLOAT:
		case IM_BANDFMT_COMPLEX:
			CONV3x3_FLOAT( float );
			break;

		case IM_BANDFMT_DOUBLE:
		case IM_BANDFMT_DPCOMPLEX:
			CONV3x3_FLOAT( double );
			break;

		default:
			g_assert_not_reached();
		}
	}

	VIPS_GATE_STOP( "conv3x3_gen: work" );

	return( 0 );
}
|
|
|
|
/* The VipsVector codepath. Runs each compiled convolve pass in turn over
 * one scanline, ping-ponging the partial sums between seq->t1 and
 * seq->t2, then runs the clip program to produce the output line.
 */
static int
convvec_gen( REGION *or, void *vseq, void *a, void *b )
{
	ConvSequence *seq = (ConvSequence *) vseq;
	IMAGE *in = (IMAGE *) a;
	Conv *conv = (Conv *) b;
	INTMASK *mask = conv->mask;
	REGION *ir = seq->ir;

	Rect *r = &or->valid;
	/* Complex formats are processed as twice as many real elements.
	 */
	int sz = IM_REGION_N_ELEMENTS( or ) * (im_iscomplex( in ) ? 2 : 1);

	Rect s;
	int j, y;
	VipsExecutor convolve[MAX_PASS];
	VipsExecutor clip;

	/* Prepare the section of the input image we need. A little larger
	 * than the section of the output image we are producing.
	 */
	s = *r;
	s.width += mask->xsize - 1;
	s.height += mask->ysize - 1;
	if( im_prepare( ir, &s ) )
		return( -1 );

	VIPS_GATE_START( "convvec_gen: work" );

	/* Bind each compiled program to this line width.
	 */
	for( j = 0; j < conv->n_pass; j++ )
		vips_executor_set_program( &convolve[j],
			conv->pass[j].vector, sz );
	vips_executor_set_program( &clip, conv->clip, sz );

	for( y = 0; y < r->height; y++ ) {
#ifdef DEBUG_PIXELS
{
		int h, v;

		printf( "before convolve: %d, %d\n", r->left, r->top + y );
		for( v = 0; v < mask->ysize; v++ ) {
			for( h = 0; h < mask->xsize; h++ )
				printf( "%3d ", *IM_REGION_ADDR( ir,
					r->left + h, r->top + y + v ) );
			printf( "\n" );
		}
}
#endif /*DEBUG_PIXELS*/

		for( j = 0; j < conv->n_pass; j++ ) {
			/* We always read from t1 and write to t2.
			 */
			vips_executor_set_scanline( &convolve[j],
				ir, r->left, r->top + y );
			vips_executor_set_array( &convolve[j],
				conv->pass[j].r, seq->t1 );
			vips_executor_set_destination( &convolve[j], seq->t2 );
			vips_executor_run( &convolve[j] );

			/* Swap so the next pass (and the clip stage) reads
			 * this pass's result from t1.
			 */
			IM_SWAP( void *, seq->t1, seq->t2 );
		}

#ifdef DEBUG_PIXELS
		printf( "before clip: %d\n", ((signed short *) seq->t1)[0] );
#endif /*DEBUG_PIXELS*/

		vips_executor_set_array( &clip, conv->s1, seq->t1 );
		vips_executor_set_destination( &clip,
			IM_REGION_ADDR( or, r->left, r->top + y ) );
		vips_executor_run( &clip );

#ifdef DEBUG_PIXELS
		printf( "after clip: %d\n",
			*IM_REGION_ADDR( or, r->left, r->top + y ) );
#endif /*DEBUG_PIXELS*/
	}

	VIPS_GATE_STOP( "convvec_gen: work" );

	return( 0 );
}
|
|
|
|
/* Convolve without edge expansion: the output is smaller than the input
 * by (mask size - 1) in each axis. Picks the fastest available generate
 * function: vector, special-cased 3x3, or the general C path.
 *
 * Returns 0 on success, -1 on error.
 */
int
im_conv_raw( IMAGE *in, IMAGE *out, INTMASK *mask )
{
	Conv *conv;
	im_generate_fn generate;

#ifdef DEBUG
	printf( "im_conv_raw: starting with matrix:\n" );
	im_print_imask( mask );
#endif /*DEBUG*/

	/* Check parameters.
	 */
	if( im_piocheck( in, out ) ||
		im_check_uncoded( "im_conv", in ) ||
		im_check_imask( "im_conv", mask ) )
		return( -1 );
	if( mask->scale == 0 ) {
		im_error( "im_conv", "%s", "mask scale must be non-zero" );
		return( -1 );
	}
	if( !(conv = conv_new( in, out, mask )) )
		return( -1 );

	/* Prepare output. Consider a 7x7 mask and a 7x7 image --- the output
	 * would be 1x1.
	 */
	if( im_cp_desc( out, in ) )
		return( -1 );
	out->Xsize -= mask->xsize - 1;
	out->Ysize -= mask->ysize - 1;
	if( out->Xsize <= 0 || out->Ysize <= 0 ) {
		im_error( "im_conv", "%s", _( "image too small for mask" ) );
		return( -1 );
	}

	/* n_pass is non-zero only when conv_new() managed to compile a
	 * vector convolver for this mask / image combination.
	 */
	if( conv->n_pass ) {
		generate = convvec_gen;

#ifdef DEBUG
		printf( "im_conv_raw: using vector path\n" );
#endif /*DEBUG*/
	}
	else if( mask->xsize == 3 && mask->ysize == 3 ) {
		generate = conv3x3_gen;

#ifdef DEBUG
		printf( "im_conv_raw: using 3x3 path\n" );
#endif /*DEBUG*/
	}
	else {
		generate = conv_gen;

#ifdef DEBUG
		printf( "im_conv_raw: using general path\n" );
#endif /*DEBUG*/
	}

	if( im_demand_hint( out, IM_SMALLTILE, in, NULL ) ||
		im_generate( out, conv_start, generate, conv_stop, in, conv ) )
		return( -1 );

	/* Record where the output sits relative to the input.
	 */
	out->Xoffset = -mask->xsize / 2;
	out->Yoffset = -mask->ysize / 2;

	return( 0 );
}
|
|
|
|
int
|
|
im_conv( IMAGE *in, IMAGE *out, INTMASK *mask )
|
|
{
|
|
IMAGE *t1 = im_open_local( out, "im_conv intermediate", "p" );
|
|
|
|
if( !t1 ||
|
|
im_embed( in, t1, 1, mask->xsize / 2, mask->ysize / 2,
|
|
in->Xsize + mask->xsize - 1,
|
|
in->Ysize + mask->ysize - 1 ) ||
|
|
im_conv_raw( t1, out, mask ) )
|
|
return( -1 );
|
|
|
|
out->Xoffset = 0;
|
|
out->Yoffset = 0;
|
|
|
|
return( 0 );
|
|
}
|