still trying to get reducevl3 to vectorise

This commit is contained in:
John Cupitt 2016-03-10 10:24:44 +00:00
parent f58d941d99
commit 4d10bd12f9
4 changed files with 107 additions and 76 deletions

View File

@ -65,8 +65,12 @@ typedef struct _VipsResampleClass {
GType vips_resample_get_type( void ); GType vips_resample_get_type( void );
int vips_reducehl3_get_points( VipsKernel kernel ); /* The max size of the vector we use.
void vips_reducehl3_make_mask( VipsKernel kernel, double x, double *c ); */
#define MAX_POINTS (6)
int vips_reduce_get_points( VipsKernel kernel );
void vips_reduce_make_mask( VipsKernel kernel, double x, double *c );
#ifdef __cplusplus #ifdef __cplusplus
} }

View File

@ -64,10 +64,6 @@
* 1D resampling kernels. * 1D resampling kernels.
*/ */
/* The max size of the vector we use.
*/
#define MAX_POINTS (6)
typedef struct _VipsReducehl3 { typedef struct _VipsReducehl3 {
VipsResample parent_instance; VipsResample parent_instance;
@ -101,7 +97,7 @@ G_DEFINE_TYPE( VipsReducehl3, vips_reducehl3, VIPS_TYPE_RESAMPLE );
/* Get n points. /* Get n points.
*/ */
int int
vips_reducehl3_get_points( VipsKernel kernel ) vips_reduce_get_points( VipsKernel kernel )
{ {
switch( kernel ) { switch( kernel ) {
case VIPS_KERNEL_NEAREST: case VIPS_KERNEL_NEAREST:
@ -128,7 +124,7 @@ vips_reducehl3_get_points( VipsKernel kernel )
/* Calculate a mask. /* Calculate a mask.
*/ */
void void
vips_reducehl3_make_mask( VipsKernel kernel, double x, double *c ) vips_reduce_make_mask( VipsKernel kernel, double x, double *c )
{ {
switch( kernel ) { switch( kernel ) {
case VIPS_KERNEL_NEAREST: case VIPS_KERNEL_NEAREST:
@ -136,8 +132,8 @@ vips_reducehl3_make_mask( VipsKernel kernel, double x, double *c )
break; break;
case VIPS_KERNEL_LINEAR: case VIPS_KERNEL_LINEAR:
c[0] = x; c[0] = 1.0 - x;
c[1] = 1.0 - x; c[1] = x;
break; break;
case VIPS_KERNEL_CUBIC: case VIPS_KERNEL_CUBIC:
@ -192,22 +188,6 @@ reducehl3_unsigned_uint8_4tab( VipsPel *out, const VipsPel *in,
} }
} }
/* Our inner loop. Operate on elements of size T, gather results in an
* intermediate of type IT.
*/
template <typename T, typename IT>
static IT
reducehl3_sum( const T * restrict in, int bands, const IT * restrict c, int n )
{
IT sum;
sum = 0;
for( int i = 0; i < n; i++ )
sum += c[i] * in[i * bands];
return( sum );
}
template <typename T, int max_value> template <typename T, int max_value>
static void inline static void inline
reducehl3_unsigned_int_tab( VipsReducehl3 *reducehl3, reducehl3_unsigned_int_tab( VipsReducehl3 *reducehl3,
@ -216,11 +196,12 @@ reducehl3_unsigned_int_tab( VipsReducehl3 *reducehl3,
{ {
T* restrict out = (T *) pout; T* restrict out = (T *) pout;
const T* restrict in = (T *) pin; const T* restrict in = (T *) pin;
const int n = reducehl3->n_points;
for( int z = 0; z < bands; z++ ) { for( int z = 0; z < bands; z++ ) {
int sum; int sum;
sum = reducehl3_sum<T, int>(in, bands, cx, reducehl3->n_points); sum = reduce_sum<T, int>( in, bands, cx, n );
sum = unsigned_fixed_round( sum ); sum = unsigned_fixed_round( sum );
sum = VIPS_CLIP( 0, sum, max_value ); sum = VIPS_CLIP( 0, sum, max_value );
@ -238,11 +219,12 @@ reducehl3_signed_int_tab( VipsReducehl3 *reducehl3,
{ {
T* restrict out = (T *) pout; T* restrict out = (T *) pout;
const T* restrict in = (T *) pin; const T* restrict in = (T *) pin;
const int n = reducehl3->n_points;
for( int z = 0; z < bands; z++ ) { for( int z = 0; z < bands; z++ ) {
int sum; int sum;
sum = reducehl3_sum<T, int>(in, bands, cx, reducehl3->n_points); sum = reduce_sum<T, int>( in, bands, cx, n );
sum = signed_fixed_round( sum ); sum = signed_fixed_round( sum );
sum = VIPS_CLIP( min_value, sum, max_value ); sum = VIPS_CLIP( min_value, sum, max_value );
@ -262,10 +244,10 @@ reducehl3_float_tab( VipsReducehl3 *reducehl3,
{ {
T* restrict out = (T *) pout; T* restrict out = (T *) pout;
const T* restrict in = (T *) pin; const T* restrict in = (T *) pin;
const int n = reducehl3->n_points;
for( int z = 0; z < bands; z++ ) { for( int z = 0; z < bands; z++ ) {
out[z] = reducehl3_sum<T, double> out[z] = reduce_sum<T, double>( in, bands, cx, n );
(in, bands, cx, reducehl3->n_points);
in += 1; in += 1;
} }
} }
@ -281,12 +263,12 @@ reducehl3_unsigned_int32_tab( VipsReducehl3 *reducehl3,
{ {
T* restrict out = (T *) pout; T* restrict out = (T *) pout;
const T* restrict in = (T *) pin; const T* restrict in = (T *) pin;
const int n = reducehl3->n_points;
for( int z = 0; z < bands; z++ ) { for( int z = 0; z < bands; z++ ) {
double sum; double sum;
sum = reducehl3_sum<T, double> sum = reduce_sum<T, double>( in, bands, cx, n );
(in, bands, cx, reducehl3->n_points);
out[z] = VIPS_CLIP( 0, sum, max_value ); out[z] = VIPS_CLIP( 0, sum, max_value );
in += 1; in += 1;
@ -301,12 +283,12 @@ reducehl3_signed_int32_tab( VipsReducehl3 *reducehl3,
{ {
T* restrict out = (T *) pout; T* restrict out = (T *) pout;
const T* restrict in = (T *) pin; const T* restrict in = (T *) pin;
const int n = reducehl3->n_points;
for( int z = 0; z < bands; z++ ) { for( int z = 0; z < bands; z++ ) {
double sum; double sum;
sum = reducehl3_sum<T, double> sum = reduce_sum<T, double>( in, bands, cx, n );
(in, bands, cx, reducehl3->n_points);
sum = VIPS_CLIP( min_value, sum, max_value ); sum = VIPS_CLIP( min_value, sum, max_value );
out[z] = sum; out[z] = sum;
@ -324,14 +306,14 @@ reducehl3_notab( VipsReducehl3 *reducehl3,
{ {
T* restrict out = (T *) pout; T* restrict out = (T *) pout;
const T* restrict in = (T *) pin; const T* restrict in = (T *) pin;
const int n = reducehl3->n_points;
double cx[MAX_POINTS]; double cx[MAX_POINTS];
vips_reducehl3_make_mask( reducehl3->kernel, x, cx ); vips_reduce_make_mask( reducehl3->kernel, x, cx );
for( int z = 0; z < bands; z++ ) { for( int z = 0; z < bands; z++ ) {
out[z] = reducehl3_sum<T, double> out[z] = reduce_sum<T, double>( in, bands, cx, n );
(in, bands, cx, reducehl3->n_points);
in += 1; in += 1;
} }
@ -488,9 +470,9 @@ vips_reducehl3_build( VipsObject *object )
/* Build the tables of pre-computed coefficients. /* Build the tables of pre-computed coefficients.
*/ */
reducehl3->n_points = vips_reducehl3_get_points( reducehl3->kernel ); reducehl3->n_points = vips_reduce_get_points( reducehl3->kernel );
for( int x = 0; x < VIPS_TRANSFORM_SCALE + 1; x++ ) { for( int x = 0; x < VIPS_TRANSFORM_SCALE + 1; x++ ) {
vips_reducehl3_make_mask( reducehl3->kernel, vips_reduce_make_mask( reducehl3->kernel,
(float) x / VIPS_TRANSFORM_SCALE, (float) x / VIPS_TRANSFORM_SCALE,
reducehl3->matrixf[x] ); reducehl3->matrixf[x] );
@ -575,7 +557,7 @@ vips_reducehl3_class_init( VipsReducehl3Class *reducehl3_class )
VIPS_ARG_ENUM( reducehl3_class, "kernel", 3, VIPS_ARG_ENUM( reducehl3_class, "kernel", 3,
_( "Kernel" ), _( "Kernel" ),
_( "Resamling kernel" ), _( "Resampling kernel" ),
VIPS_ARGUMENT_OPTIONAL_INPUT, VIPS_ARGUMENT_OPTIONAL_INPUT,
G_STRUCT_OFFSET( VipsReducehl3, kernel ), G_STRUCT_OFFSET( VipsReducehl3, kernel ),
VIPS_TYPE_KERNEL, VIPS_KERNEL_CUBIC ); VIPS_TYPE_KERNEL, VIPS_KERNEL_CUBIC );

View File

@ -2,6 +2,8 @@
* *
* 29/1/16 * 29/1/16
* - from shrinkv.c * - from shrinkv.c
* 10/3/16
* - add other kernels
*/ */
/* /*
@ -51,25 +53,34 @@
#include "presample.h" #include "presample.h"
#include "templates.h" #include "templates.h"
/* The max size of the vector we use.
*/
#define MAX_POINTS (6)
typedef struct _VipsReducevl3 { typedef struct _VipsReducevl3 {
VipsResample parent_instance; VipsResample parent_instance;
double yshrink; /* Shrink factor */ double yshrink; /* Shrink factor */
/* The thing we use to make the kernel.
*/
VipsKernel kernel;
/* Number of points in kernel.
*/
int n_points;
/* Precalculated interpolation matrices. int (used for pel
* sizes up to short), and double (for all others). We go to
* scale + 1 so we can round-to-nearest safely.
*/
int matrixi[VIPS_TRANSFORM_SCALE + 1][MAX_POINTS];
double matrixf[VIPS_TRANSFORM_SCALE + 1][MAX_POINTS];
} VipsReducevl3; } VipsReducevl3;
typedef VipsResampleClass VipsReducevl3Class; typedef VipsResampleClass VipsReducevl3Class;
/* Precalculated interpolation matrices. int (used for pel
* sizes up to short), and double (for all others). We go to
* scale + 1 so we can round-to-nearest safely.
*/
const int n_points = 6;
static int vips_reducevl3_matrixi[VIPS_TRANSFORM_SCALE + 1][n_points];
static double vips_reducevl3_matrixf[VIPS_TRANSFORM_SCALE + 1][n_points];
/* We need C linkage for this. /* We need C linkage for this.
*/ */
extern "C" { extern "C" {
@ -78,29 +89,33 @@ G_DEFINE_TYPE( VipsReducevl3, vips_reducevl3, VIPS_TYPE_RESAMPLE );
template <typename T, int max_value> template <typename T, int max_value>
static void inline static void inline
reducevl3_unsigned_int_tab( VipsPel *pout, const VipsPel *pin, reducevl3_unsigned_int_tab( VipsReducevl3 *reducevl3,
VipsPel *pout, const VipsPel *pin,
const int ne, const int lskip, const int ne, const int lskip,
const int * restrict cy ) const int * restrict cy )
{ {
T* restrict out = (T *) pout; T* restrict out = (T *) pout;
const T* restrict in = (T *) pin; const T* restrict in = (T *) pin;
const int n = reducevl3->n_points;
const int l1 = lskip / sizeof( T ); const int l1 = lskip / sizeof( T );
const int round_by = VIPS_INTERPOLATE_SCALE >> 1;
for( int z = 0; z < ne; z++ ) { for( int z = 0; z < ne; z++ ) {
int sum; int sum;
sum = 0; sum = 0;
for( int i = 0; i < n_points; i++ ) for( int i = 0; i < n; i++ )
sum += cy[i] * in[i * l1]; sum += cy[i] * in[z + i * l1];
sum = unsigned_fixed_round( sum ); sum = (sum + round_by) >> VIPS_INTERPOLATE_SHIFT;
sum = VIPS_CLIP( 0, sum, max_value ); //sum = reduce_sum<T, int>( in, l1, cy, n );
//sum = unsigned_fixed_round( sum );
//sum = VIPS_CLIP( 0, sum, max_value );
out[z] = sum; out[z] = sum;
in += 1; //in += 1;
} }
} }
@ -129,7 +144,7 @@ vips_reducevl3_gen( VipsRegion *out_region, void *seq,
s.left = r->left; s.left = r->left;
s.top = r->top * reducevl3->yshrink; s.top = r->top * reducevl3->yshrink;
s.width = r->width; s.width = r->width;
s.height = r->height * reducevl3->yshrink + n_points; s.height = r->height * reducevl3->yshrink + reducevl3->n_points;
if( vips_region_prepare( ir, &s ) ) if( vips_region_prepare( ir, &s ) )
return( -1 ); return( -1 );
@ -142,14 +157,15 @@ vips_reducevl3_gen( VipsRegion *out_region, void *seq,
const int sy = Y * VIPS_TRANSFORM_SCALE * 2; const int sy = Y * VIPS_TRANSFORM_SCALE * 2;
const int siy = sy & (VIPS_TRANSFORM_SCALE * 2 - 1); const int siy = sy & (VIPS_TRANSFORM_SCALE * 2 - 1);
const int ty = (siy + 1) >> 1; const int ty = (siy + 1) >> 1;
const int *cyi = vips_reducevl3_matrixi[ty]; const int *cyi = reducevl3->matrixi[ty];
const double *cyf = vips_reducevl3_matrixf[ty]; const double *cyf = reducevl3->matrixf[ty];
const int lskip = VIPS_REGION_LSKIP( ir ); const int lskip = VIPS_REGION_LSKIP( ir );
switch( in->BandFmt ) { switch( in->BandFmt ) {
case VIPS_FORMAT_UCHAR: case VIPS_FORMAT_UCHAR:
reducevl3_unsigned_int_tab reducevl3_unsigned_int_tab
<unsigned char, UCHAR_MAX>( <unsigned char, UCHAR_MAX>(
reducevl3,
q, p, ne, lskip, cyi ); q, p, ne, lskip, cyi );
break; break;
@ -191,6 +207,19 @@ vips_reducevl3_build( VipsObject *object )
if( reducevl3->yshrink == 1 ) if( reducevl3->yshrink == 1 )
return( vips_image_write( in, resample->out ) ); return( vips_image_write( in, resample->out ) );
/* Build the tables of pre-computed coefficients.
*/
reducevl3->n_points = vips_reduce_get_points( reducevl3->kernel );
for( int y = 0; y < VIPS_TRANSFORM_SCALE + 1; y++ ) {
vips_reduce_make_mask( reducevl3->kernel,
(float) y / VIPS_TRANSFORM_SCALE,
reducevl3->matrixf[y] );
for( int i = 0; i < reducevl3->n_points; i++ )
reducevl3->matrixi[y][i] = reducevl3->matrixf[y][i] *
VIPS_INTERPOLATE_SCALE;
}
/* Unpack for processing. /* Unpack for processing.
*/ */
if( vips_image_decode( in, &t[0] ) ) if( vips_image_decode( in, &t[0] ) )
@ -200,8 +229,8 @@ vips_reducevl3_build( VipsObject *object )
/* Add new pixels around the input so we can interpolate at the edges. /* Add new pixels around the input so we can interpolate at the edges.
*/ */
if( vips_embed( in, &t[1], if( vips_embed( in, &t[1],
0, n_points / 2, 0, reducevl3->n_points / 2,
in->Xsize, in->Ysize + n_points - 1, in->Xsize, in->Ysize + reducevl3->n_points - 1,
"extend", VIPS_EXTEND_COPY, "extend", VIPS_EXTEND_COPY,
NULL ) ) NULL ) )
return( -1 ); return( -1 );
@ -217,7 +246,7 @@ vips_reducevl3_build( VipsObject *object )
* example, vipsthumbnail knows the true reduce factor (including the * example, vipsthumbnail knows the true reduce factor (including the
* fractional part), we just see the integer part here. * fractional part), we just see the integer part here.
*/ */
resample->out->Ysize = (in->Ysize - n_points + 1) / reducevl3->yshrink; resample->out->Ysize = (in->Ysize - reducevl3->n_points + 1) / reducevl3->yshrink;
if( resample->out->Ysize <= 0 ) { if( resample->out->Ysize <= 0 ) {
vips_error( object_class->nickname, vips_error( object_class->nickname,
"%s", _( "image has shrunk to nothing" ) ); "%s", _( "image has shrunk to nothing" ) );
@ -264,24 +293,20 @@ vips_reducevl3_class_init( VipsReducevl3Class *reducevl3_class )
G_STRUCT_OFFSET( VipsReducevl3, yshrink ), G_STRUCT_OFFSET( VipsReducevl3, yshrink ),
1, 1000000, 1 ); 1, 1000000, 1 );
/* Build the tables of pre-computed coefficients. VIPS_ARG_ENUM( reducevl3_class, "kernel", 3,
*/ _( "Kernel" ),
for( int y = 0; y < VIPS_TRANSFORM_SCALE + 1; y++ ) { _( "Resampling kernel" ),
calculate_coefficients_lanczos( 3, VIPS_ARGUMENT_OPTIONAL_INPUT,
(float) y / VIPS_TRANSFORM_SCALE, G_STRUCT_OFFSET( VipsReducevl3, kernel ),
vips_reducevl3_matrixf[y] ); VIPS_TYPE_KERNEL, VIPS_KERNEL_CUBIC );
for( int i = 0; i < n_points; i++ )
vips_reducevl3_matrixi[y][i] =
vips_reducevl3_matrixf[y][i] *
VIPS_INTERPOLATE_SCALE;
}
} }
static void static void
vips_reducevl3_init( VipsReducevl3 *reducevl3 ) vips_reducevl3_init( VipsReducevl3 *reducevl3 )
{ {
reducevl3->kernel = VIPS_KERNEL_CUBIC;
} }
/** /**
@ -291,8 +316,12 @@ vips_reducevl3_init( VipsReducevl3 *reducevl3 )
* @yshrink: vertical reduce * @yshrink: vertical reduce
* @...: %NULL-terminated list of optional named arguments * @...: %NULL-terminated list of optional named arguments
* *
* Optional arguments:
*
* @kernel: #VipsKernel to use to interpolate (default: cubic)
*
* Reduce @in vertically by a float factor. The pixels in @out are * Reduce @in vertically by a float factor. The pixels in @out are
* interpolated with a 1D cubic mask. This operation will not work well for * interpolated with a 1D mask. This operation will not work well for
* a reduction of more than a factor of two. * a reduction of more than a factor of two.
* *
* This is a very low-level operation: see vips_resize() for a more * This is a very low-level operation: see vips_resize() for a more

View File

@ -339,3 +339,19 @@ calculate_coefficients_lanczos( int a, const double x, double *c )
c[i] = l; c[i] = l;
} }
} }
/* Our inner loop for resampling with a convolution. Operate on elements of
* size T, gather results in an intermediate of type IT.
*/
template <typename T, typename IT>
static IT
reduce_sum( const T * restrict in, int stride, const IT * restrict c, int n )
{
IT sum;
sum = 0;
for( int i = 0; i < n; i++ )
sum += c[i] * in[i * stride];
return( sum );
}