still trying to get reducevl3 to vectorise

This commit is contained in:
John Cupitt 2016-03-10 10:24:44 +00:00
parent f58d941d99
commit 4d10bd12f9
4 changed files with 107 additions and 76 deletions

View File

@ -65,8 +65,12 @@ typedef struct _VipsResampleClass {
GType vips_resample_get_type( void );
int vips_reducehl3_get_points( VipsKernel kernel );
void vips_reducehl3_make_mask( VipsKernel kernel, double x, double *c );
/* The max size of the vector we use.
*/
#define MAX_POINTS (6)
int vips_reduce_get_points( VipsKernel kernel );
void vips_reduce_make_mask( VipsKernel kernel, double x, double *c );
#ifdef __cplusplus
}

View File

@ -64,10 +64,6 @@
* 1D resampling kernels.
*/
/* The max size of the vector we use.
*/
#define MAX_POINTS (6)
typedef struct _VipsReducehl3 {
VipsResample parent_instance;
@ -101,7 +97,7 @@ G_DEFINE_TYPE( VipsReducehl3, vips_reducehl3, VIPS_TYPE_RESAMPLE );
/* Get n points.
*/
int
vips_reducehl3_get_points( VipsKernel kernel )
vips_reduce_get_points( VipsKernel kernel )
{
switch( kernel ) {
case VIPS_KERNEL_NEAREST:
@ -128,7 +124,7 @@ vips_reducehl3_get_points( VipsKernel kernel )
/* Calculate a mask.
*/
void
vips_reducehl3_make_mask( VipsKernel kernel, double x, double *c )
vips_reduce_make_mask( VipsKernel kernel, double x, double *c )
{
switch( kernel ) {
case VIPS_KERNEL_NEAREST:
@ -136,8 +132,8 @@ vips_reducehl3_make_mask( VipsKernel kernel, double x, double *c )
break;
case VIPS_KERNEL_LINEAR:
c[0] = x;
c[1] = 1.0 - x;
c[0] = 1.0 - x;
c[1] = x;
break;
case VIPS_KERNEL_CUBIC:
@ -192,22 +188,6 @@ reducehl3_unsigned_uint8_4tab( VipsPel *out, const VipsPel *in,
}
}
/* Our inner loop. Operate on elements of size T, gather results in an
* intermediate of type IT.
*/
template <typename T, typename IT>
static IT
reducehl3_sum( const T * restrict in, int bands, const IT * restrict c, int n )
{
IT sum;
sum = 0;
for( int i = 0; i < n; i++ )
sum += c[i] * in[i * bands];
return( sum );
}
template <typename T, int max_value>
static void inline
reducehl3_unsigned_int_tab( VipsReducehl3 *reducehl3,
@ -216,11 +196,12 @@ reducehl3_unsigned_int_tab( VipsReducehl3 *reducehl3,
{
T* restrict out = (T *) pout;
const T* restrict in = (T *) pin;
const int n = reducehl3->n_points;
for( int z = 0; z < bands; z++ ) {
int sum;
sum = reducehl3_sum<T, int>(in, bands, cx, reducehl3->n_points);
sum = reduce_sum<T, int>( in, bands, cx, n );
sum = unsigned_fixed_round( sum );
sum = VIPS_CLIP( 0, sum, max_value );
@ -238,11 +219,12 @@ reducehl3_signed_int_tab( VipsReducehl3 *reducehl3,
{
T* restrict out = (T *) pout;
const T* restrict in = (T *) pin;
const int n = reducehl3->n_points;
for( int z = 0; z < bands; z++ ) {
int sum;
sum = reducehl3_sum<T, int>(in, bands, cx, reducehl3->n_points);
sum = reduce_sum<T, int>( in, bands, cx, n );
sum = signed_fixed_round( sum );
sum = VIPS_CLIP( min_value, sum, max_value );
@ -262,10 +244,10 @@ reducehl3_float_tab( VipsReducehl3 *reducehl3,
{
T* restrict out = (T *) pout;
const T* restrict in = (T *) pin;
const int n = reducehl3->n_points;
for( int z = 0; z < bands; z++ ) {
out[z] = reducehl3_sum<T, double>
(in, bands, cx, reducehl3->n_points);
out[z] = reduce_sum<T, double>( in, bands, cx, n );
in += 1;
}
}
@ -281,12 +263,12 @@ reducehl3_unsigned_int32_tab( VipsReducehl3 *reducehl3,
{
T* restrict out = (T *) pout;
const T* restrict in = (T *) pin;
const int n = reducehl3->n_points;
for( int z = 0; z < bands; z++ ) {
double sum;
sum = reducehl3_sum<T, double>
(in, bands, cx, reducehl3->n_points);
sum = reduce_sum<T, double>( in, bands, cx, n );
out[z] = VIPS_CLIP( 0, sum, max_value );
in += 1;
@ -301,12 +283,12 @@ reducehl3_signed_int32_tab( VipsReducehl3 *reducehl3,
{
T* restrict out = (T *) pout;
const T* restrict in = (T *) pin;
const int n = reducehl3->n_points;
for( int z = 0; z < bands; z++ ) {
double sum;
sum = reducehl3_sum<T, double>
(in, bands, cx, reducehl3->n_points);
sum = reduce_sum<T, double>( in, bands, cx, n );
sum = VIPS_CLIP( min_value, sum, max_value );
out[z] = sum;
@ -324,14 +306,14 @@ reducehl3_notab( VipsReducehl3 *reducehl3,
{
T* restrict out = (T *) pout;
const T* restrict in = (T *) pin;
const int n = reducehl3->n_points;
double cx[MAX_POINTS];
vips_reducehl3_make_mask( reducehl3->kernel, x, cx );
vips_reduce_make_mask( reducehl3->kernel, x, cx );
for( int z = 0; z < bands; z++ ) {
out[z] = reducehl3_sum<T, double>
(in, bands, cx, reducehl3->n_points);
out[z] = reduce_sum<T, double>( in, bands, cx, n );
in += 1;
}
@ -488,9 +470,9 @@ vips_reducehl3_build( VipsObject *object )
/* Build the tables of pre-computed coefficients.
*/
reducehl3->n_points = vips_reducehl3_get_points( reducehl3->kernel );
reducehl3->n_points = vips_reduce_get_points( reducehl3->kernel );
for( int x = 0; x < VIPS_TRANSFORM_SCALE + 1; x++ ) {
vips_reducehl3_make_mask( reducehl3->kernel,
vips_reduce_make_mask( reducehl3->kernel,
(float) x / VIPS_TRANSFORM_SCALE,
reducehl3->matrixf[x] );
@ -575,7 +557,7 @@ vips_reducehl3_class_init( VipsReducehl3Class *reducehl3_class )
VIPS_ARG_ENUM( reducehl3_class, "kernel", 3,
_( "Kernel" ),
_( "Resamling kernel" ),
_( "Resampling kernel" ),
VIPS_ARGUMENT_OPTIONAL_INPUT,
G_STRUCT_OFFSET( VipsReducehl3, kernel ),
VIPS_TYPE_KERNEL, VIPS_KERNEL_CUBIC );

View File

@ -2,6 +2,8 @@
*
* 29/1/16
* - from shrinkv.c
* 10/3/16
* - add other kernels
*/
/*
@ -51,25 +53,34 @@
#include "presample.h"
#include "templates.h"
/* The max size of the vector we use.
*/
#define MAX_POINTS (6)
/* Instance state for the vertical reduce operation: the shrink factor, the
 * kernel selected by the caller and the coefficient tables derived from it.
 */
typedef struct _VipsReducevl3 {
VipsResample parent_instance;
double yshrink; /* Shrink factor */
/* The thing we use to make the kernel. Set from the optional "kernel"
* argument.
*/
VipsKernel kernel;
/* Number of points in kernel. Filled in by build from
* vips_reduce_get_points(); it must not exceed MAX_POINTS, the row
* width of the tables below.
*/
int n_points;
/* Precalculated interpolation matrices. int (used for pel
* sizes up to short), and double (for all others). We go to
* scale + 1 so we can round-to-nearest safely.
*
* Row y holds the mask for an offset of y / VIPS_TRANSFORM_SCALE.
* matrixi is matrixf multiplied by VIPS_INTERPOLATE_SCALE and
* converted to int.
*/
int matrixi[VIPS_TRANSFORM_SCALE + 1][MAX_POINTS];
double matrixf[VIPS_TRANSFORM_SCALE + 1][MAX_POINTS];
} VipsReducevl3;
typedef VipsResampleClass VipsReducevl3Class;
/* Precalculated interpolation matrices. int (used for pel
* sizes up to short), and double (for all others). We go to
* scale + 1 so we can round-to-nearest safely.
*/
const int n_points = 6;
static int vips_reducevl3_matrixi[VIPS_TRANSFORM_SCALE + 1][n_points];
static double vips_reducevl3_matrixf[VIPS_TRANSFORM_SCALE + 1][n_points];
/* We need C linkage for this.
*/
extern "C" {
@ -78,29 +89,33 @@ G_DEFINE_TYPE( VipsReducevl3, vips_reducevl3, VIPS_TYPE_RESAMPLE );
template <typename T, int max_value>
static void inline
reducevl3_unsigned_int_tab( VipsPel *pout, const VipsPel *pin,
reducevl3_unsigned_int_tab( VipsReducevl3 *reducevl3,
VipsPel *pout, const VipsPel *pin,
const int ne, const int lskip,
const int * restrict cy )
{
T* restrict out = (T *) pout;
const T* restrict in = (T *) pin;
const int n = reducevl3->n_points;
const int l1 = lskip / sizeof( T );
const int round_by = VIPS_INTERPOLATE_SCALE >> 1;
for( int z = 0; z < ne; z++ ) {
int sum;
sum = 0;
for( int i = 0; i < n_points; i++ )
sum += cy[i] * in[i * l1];
sum = 0;
for( int i = 0; i < n; i++ )
sum += cy[i] * in[z + i * l1];
sum = unsigned_fixed_round( sum );
sum = (sum + round_by) >> VIPS_INTERPOLATE_SHIFT;
sum = VIPS_CLIP( 0, sum, max_value );
//sum = reduce_sum<T, int>( in, l1, cy, n );
//sum = unsigned_fixed_round( sum );
//sum = VIPS_CLIP( 0, sum, max_value );
out[z] = sum;
in += 1;
//in += 1;
}
}
@ -129,7 +144,7 @@ vips_reducevl3_gen( VipsRegion *out_region, void *seq,
s.left = r->left;
s.top = r->top * reducevl3->yshrink;
s.width = r->width;
s.height = r->height * reducevl3->yshrink + n_points;
s.height = r->height * reducevl3->yshrink + reducevl3->n_points;
if( vips_region_prepare( ir, &s ) )
return( -1 );
@ -142,14 +157,15 @@ vips_reducevl3_gen( VipsRegion *out_region, void *seq,
const int sy = Y * VIPS_TRANSFORM_SCALE * 2;
const int siy = sy & (VIPS_TRANSFORM_SCALE * 2 - 1);
const int ty = (siy + 1) >> 1;
const int *cyi = vips_reducevl3_matrixi[ty];
const double *cyf = vips_reducevl3_matrixf[ty];
const int *cyi = reducevl3->matrixi[ty];
const double *cyf = reducevl3->matrixf[ty];
const int lskip = VIPS_REGION_LSKIP( ir );
switch( in->BandFmt ) {
case VIPS_FORMAT_UCHAR:
reducevl3_unsigned_int_tab
<unsigned char, UCHAR_MAX>(
reducevl3,
q, p, ne, lskip, cyi );
break;
@ -191,6 +207,19 @@ vips_reducevl3_build( VipsObject *object )
if( reducevl3->yshrink == 1 )
return( vips_image_write( in, resample->out ) );
/* Build the tables of pre-computed coefficients.
*/
reducevl3->n_points = vips_reduce_get_points( reducevl3->kernel );
for( int y = 0; y < VIPS_TRANSFORM_SCALE + 1; y++ ) {
vips_reduce_make_mask( reducevl3->kernel,
(float) y / VIPS_TRANSFORM_SCALE,
reducevl3->matrixf[y] );
for( int i = 0; i < reducevl3->n_points; i++ )
reducevl3->matrixi[y][i] = reducevl3->matrixf[y][i] *
VIPS_INTERPOLATE_SCALE;
}
/* Unpack for processing.
*/
if( vips_image_decode( in, &t[0] ) )
@ -200,8 +229,8 @@ vips_reducevl3_build( VipsObject *object )
/* Add new pixels around the input so we can interpolate at the edges.
*/
if( vips_embed( in, &t[1],
0, n_points / 2,
in->Xsize, in->Ysize + n_points - 1,
0, reducevl3->n_points / 2,
in->Xsize, in->Ysize + reducevl3->n_points - 1,
"extend", VIPS_EXTEND_COPY,
NULL ) )
return( -1 );
@ -217,7 +246,7 @@ vips_reducevl3_build( VipsObject *object )
* example, vipsthumbnail knows the true reduce factor (including the
* fractional part), we just see the integer part here.
*/
resample->out->Ysize = (in->Ysize - n_points + 1) / reducevl3->yshrink;
resample->out->Ysize = (in->Ysize - reducevl3->n_points + 1) / reducevl3->yshrink;
if( resample->out->Ysize <= 0 ) {
vips_error( object_class->nickname,
"%s", _( "image has shrunk to nothing" ) );
@ -264,24 +293,20 @@ vips_reducevl3_class_init( VipsReducevl3Class *reducevl3_class )
G_STRUCT_OFFSET( VipsReducevl3, yshrink ),
1, 1000000, 1 );
/* Build the tables of pre-computed coefficients.
*/
for( int y = 0; y < VIPS_TRANSFORM_SCALE + 1; y++ ) {
calculate_coefficients_lanczos( 3,
(float) y / VIPS_TRANSFORM_SCALE,
vips_reducevl3_matrixf[y] );
VIPS_ARG_ENUM( reducevl3_class, "kernel", 3,
_( "Kernel" ),
_( "Resampling kernel" ),
VIPS_ARGUMENT_OPTIONAL_INPUT,
G_STRUCT_OFFSET( VipsReducevl3, kernel ),
VIPS_TYPE_KERNEL, VIPS_KERNEL_CUBIC );
for( int i = 0; i < n_points; i++ )
vips_reducevl3_matrixi[y][i] =
vips_reducevl3_matrixf[y][i] *
VIPS_INTERPOLATE_SCALE;
}
}
/* Instance initialiser registered via G_DEFINE_TYPE. Start with the cubic
 * kernel, the same default the "kernel" argument declares in class_init.
 */
static void
vips_reducevl3_init( VipsReducevl3 *reducevl3 )
{
reducevl3->kernel = VIPS_KERNEL_CUBIC;
}
/**
@ -291,8 +316,12 @@ vips_reducevl3_init( VipsReducevl3 *reducevl3 )
* @yshrink: vertical reduce
* @...: %NULL-terminated list of optional named arguments
*
* Optional arguments:
*
* @kernel: #VipsKernel to use to interpolate (default: cubic)
*
* Reduce @in vertically by a float factor. The pixels in @out are
* interpolated with a 1D cubic mask. This operation will not work well for
* interpolated with a 1D mask. This operation will not work well for
* a reduction of more than a factor of two.
*
* This is a very low-level operation: see vips_resize() for a more

View File

@ -339,3 +339,19 @@ calculate_coefficients_lanczos( int a, const double x, double *c )
c[i] = l;
}
}
/* Our inner loop for resampling with a convolution. Operate on elements of
* size T, gather results in an intermediate of type IT.
*/
template <typename T, typename IT>
static IT
reduce_sum( const T * restrict in, int stride, const IT * restrict c, int n )
{
IT sum;
sum = 0;
for( int i = 0; i < n; i++ )
sum += c[i] * in[i * stride];
return( sum );
}