finish vl3

but it's still not producing identical results to reducev, seems to be
offset by one?
This commit is contained in:
John Cupitt 2016-03-10 12:16:21 +00:00
parent 4d10bd12f9
commit e9bf936377
2 changed files with 213 additions and 38 deletions

View File

@ -154,6 +154,29 @@ vips_reduce_make_mask( VipsKernel kernel, double x, double *c )
}
}
template <typename T, int max_value>
static void inline
reducehl3_unsigned_int_tab( VipsReducehl3 *reducehl3,
VipsPel *pout, const VipsPel *pin,
const int bands, const int * restrict cx )
{
T* restrict out = (T *) pout;
const T* restrict in = (T *) pin;
const int n = reducehl3->n_points;
for( int z = 0; z < bands; z++ ) {
int sum;
sum = reduce_sum<T, int>( in, bands, cx, n );
sum = unsigned_fixed_round( sum );
sum = VIPS_CLIP( 0, sum, max_value );
out[z] = sum;
in += 1;
}
}
/* A 4-point interpolation on uint8 is the most common case ... unroll that.
*
* The inner loop here won't vectorise, but our inner loop doesn't run for
@ -188,29 +211,6 @@ reducehl3_unsigned_uint8_4tab( VipsPel *out, const VipsPel *in,
}
}
template <typename T, int max_value>
static void inline
reducehl3_unsigned_int_tab( VipsReducehl3 *reducehl3,
VipsPel *pout, const VipsPel *pin,
const int bands, const int * restrict cx )
{
T* restrict out = (T *) pout;
const T* restrict in = (T *) pin;
const int n = reducehl3->n_points;
for( int z = 0; z < bands; z++ ) {
int sum;
sum = reduce_sum<T, int>( in, bands, cx, n );
sum = unsigned_fixed_round( sum );
sum = VIPS_CLIP( 0, sum, max_value );
out[z] = sum;
in += 1;
}
}
template <typename T, int min_value, int max_value>
static void inline
reducehl3_signed_int_tab( VipsReducehl3 *reducehl3,

View File

@ -87,38 +87,162 @@ extern "C" {
G_DEFINE_TYPE( VipsReducevl3, vips_reducevl3, VIPS_TYPE_RESAMPLE );
}
/* You'd think this would vectorise, but gcc hates mixed types in nested loops
* :-(
*/
template <typename T, int max_value>
static void inline
reducevl3_unsigned_int_tab( VipsReducevl3 *reducevl3,
VipsPel *pout, const VipsPel *pin,
const int ne, const int lskip,
const int * restrict cy )
const int ne, const int lskip, const int * restrict cy )
{
T* restrict out = (T *) pout;
const T* restrict in = (T *) pin;
const int n = reducevl3->n_points;
const int l1 = lskip / sizeof( T );
const int round_by = VIPS_INTERPOLATE_SCALE >> 1;
for( int z = 0; z < ne; z++ ) {
int sum;
sum = 0;
for( int i = 0; i < n; i++ )
sum += cy[i] * in[z + i * l1];
sum = reduce_sum<T, int>( in + z, l1, cy, n );
sum = unsigned_fixed_round( sum );
sum = VIPS_CLIP( 0, sum, max_value );
sum = (sum + round_by) >> VIPS_INTERPOLATE_SHIFT;
out[z] = sum;
}
}
//sum = reduce_sum<T, int>( in, l1, cy, n );
//sum = unsigned_fixed_round( sum );
//sum = VIPS_CLIP( 0, sum, max_value );
/* An unrolled version of ^^ for the most common case.
*/
static void inline
reducevl3_unsigned_uint8_4tab( VipsPel *out, const VipsPel *in,
const int ne, const int lskip, const int *cy )
{
const int l1 = lskip;
const int l2 = l1 + l1;
const int l3 = l1 + l2;
const int c0 = cy[0];
const int c1 = cy[1];
const int c2 = cy[2];
const int c3 = cy[3];
for( int z = 0; z < ne; z++ ) {
int sum = unsigned_fixed_round(
c0 * in[0] +
c1 * in[l1] +
c2 * in[l2] +
c3 * in[l3] );
sum = VIPS_CLIP( 0, sum, 255 );
out[z] = sum;
//in += 1;
in += 1;
}
}
template <typename T, int min_value, int max_value>
static void inline
reducevl3_signed_int_tab( VipsReducevl3 *reducevl3,
VipsPel *pout, const VipsPel *pin,
const int ne, const int lskip, const int * restrict cy )
{
T* restrict out = (T *) pout;
const T* restrict in = (T *) pin;
const int n = reducevl3->n_points;
const int l1 = lskip / sizeof( T );
for( int z = 0; z < ne; z++ ) {
int sum;
sum = reduce_sum<T, int>( in + z, l1, cy, n );
sum = signed_fixed_round( sum );
sum = VIPS_CLIP( min_value, sum, max_value );
out[z] = sum;
}
}
/* Floating-point version.
*/
template <typename T>
static void inline
reducevl3_float_tab( VipsReducevl3 *reducevl3,
VipsPel *pout, const VipsPel *pin,
const int ne, const int lskip, const double * restrict cy )
{
T* restrict out = (T *) pout;
const T* restrict in = (T *) pin;
const int n = reducevl3->n_points;
const int l1 = lskip / sizeof( T );
for( int z = 0; z < ne; z++ )
out[z] = reduce_sum<T, double>( in + z, l1, cy, n );
}
/* 32-bit int output needs a double intermediate.
*/
template <typename T, int max_value>
static void inline
reducevl3_unsigned_int32_tab( VipsReducevl3 *reducevl3,
VipsPel *pout, const VipsPel *pin,
const int ne, const int lskip, const double * restrict cy )
{
T* restrict out = (T *) pout;
const T* restrict in = (T *) pin;
const int n = reducevl3->n_points;
const int l1 = lskip / sizeof( T );
for( int z = 0; z < ne; z++ ) {
double sum;
sum = reduce_sum<T, double>( in + z, l1, cy, n );
out[z] = VIPS_CLIP( 0, sum, max_value );
}
}
template <typename T, int min_value, int max_value>
static void inline
reducevl3_signed_int32_tab( VipsReducevl3 *reducevl3,
VipsPel *pout, const VipsPel *pin,
const int ne, const int lskip, const double * restrict cy )
{
T* restrict out = (T *) pout;
const T* restrict in = (T *) pin;
const int n = reducevl3->n_points;
const int l1 = lskip / sizeof( T );
for( int z = 0; z < ne; z++ ) {
double sum;
sum = reduce_sum<T, double>( in + z, l1, cy, n );
out[z] = VIPS_CLIP( min_value, sum, max_value );
}
}
/* Ultra-high-quality version for double images.
*/
template <typename T>
static void inline
reducevl3_notab( VipsReducevl3 *reducevl3,
VipsPel *pout, const VipsPel *pin,
const int ne, const int lskip, double y )
{
T* restrict out = (T *) pout;
const T* restrict in = (T *) pin;
const int n = reducevl3->n_points;
const int l1 = lskip / sizeof( T );
double cy[MAX_POINTS];
vips_reduce_make_mask( reducevl3->kernel, y, cy );
for( int z = 0; z < ne; z++ )
out[z] = reduce_sum<T, double>( in + z, l1, cy, n );
}
static int
vips_reducevl3_gen( VipsRegion *out_region, void *seq,
void *a, void *b, gboolean *stop )
@ -151,7 +275,8 @@ vips_reducevl3_gen( VipsRegion *out_region, void *seq,
VIPS_GATE_START( "vips_reducevl3_gen: work" );
for( int y = 0; y < r->height; y ++ ) {
VipsPel *q = VIPS_REGION_ADDR( out_region, r->left, r->top + y );
VipsPel *q =
VIPS_REGION_ADDR( out_region, r->left, r->top + y );
const double Y = (r->top + y) * reducevl3->yshrink;
VipsPel *p = VIPS_REGION_ADDR( ir, r->left, (int) Y );
const int sy = Y * VIPS_TRANSFORM_SCALE * 2;
@ -163,12 +288,63 @@ vips_reducevl3_gen( VipsRegion *out_region, void *seq,
switch( in->BandFmt ) {
case VIPS_FORMAT_UCHAR:
reducevl3_unsigned_int_tab
<unsigned char, UCHAR_MAX>(
if( reducevl3->n_points == 4 )
reducevl3_unsigned_uint8_4tab(
q, p, ne, lskip, cyi );
else
reducevl3_unsigned_int_tab
<unsigned char, UCHAR_MAX>(
reducevl3,
q, p, ne, lskip, cyi );
break;
case VIPS_FORMAT_CHAR:
reducevl3_signed_int_tab
<signed char, SCHAR_MIN, SCHAR_MAX>(
reducevl3,
q, p, ne, lskip, cyi );
break;
case VIPS_FORMAT_USHORT:
reducevl3_unsigned_int_tab
<unsigned short, USHRT_MAX>(
reducevl3,
q, p, ne, lskip, cyi );
break;
case VIPS_FORMAT_SHORT:
reducevl3_signed_int_tab
<signed short, SHRT_MIN, SHRT_MAX>(
reducevl3,
q, p, ne, lskip, cyi );
break;
case VIPS_FORMAT_UINT:
reducevl3_unsigned_int32_tab
<unsigned int, INT_MAX>(
reducevl3,
q, p, ne, lskip, cyf );
break;
case VIPS_FORMAT_INT:
reducevl3_signed_int32_tab
<signed int, INT_MIN, INT_MAX>(
reducevl3,
q, p, ne, lskip, cyf );
break;
case VIPS_FORMAT_FLOAT:
case VIPS_FORMAT_COMPLEX:
reducevl3_float_tab<float>( reducevl3,
q, p, ne, lskip, cyf );
break;
case VIPS_FORMAT_DPCOMPLEX:
case VIPS_FORMAT_DOUBLE:
reducevl3_notab<double>( reducevl3,
q, p, ne, lskip, Y - (int) Y );
break;
default:
g_assert_not_reached();
break;
@ -300,7 +476,6 @@ vips_reducevl3_class_init( VipsReducevl3Class *reducevl3_class )
G_STRUCT_OFFSET( VipsReducevl3, kernel ),
VIPS_TYPE_KERNEL, VIPS_KERNEL_CUBIC );
}
static void