finish hl3 version

vl3 nest
This commit is contained in:
John Cupitt 2016-03-10 08:43:50 +00:00
parent a26291a426
commit 013102de01
2 changed files with 215 additions and 9 deletions

View File

@ -138,7 +138,7 @@ reduceh_signed_int_tab( VipsPel *pout, const VipsPel *pin,
}
}
/* Floating-point version, for int/float types.
/* Floating-point version.
*/
template <typename T>
static void inline

View File

@ -2,6 +2,8 @@
*
* 29/1/16
* - from shrinkh.c
* 10/3/16
* - add other kernels
*/
/*
@ -156,6 +158,56 @@ vips_reducehl3_make_mask( VipsKernel kernel, double x, double *c )
}
}
/* Fast path for the most common case: a 4-point kernel on uint8 pixels,
 * with the four taps written out by hand.
 *
 * The per-band loop below won't vectorise, but it is too short for that to
 * matter: gcc reports that the vector version only becomes worthwhile at
 * about an 11-point kernel.
 *
 * @out: output pixel, one value is written per band
 * @in: first of the four input pixels; successive taps are @bands
 *   elements apart
 * @bands: number of bands per pixel
 * @cx: the four integer coefficients
 */
static void inline
reducehl3_unsigned_uint8_4tab( VipsPel *out, const VipsPel *in,
	const int bands, const int *cx )
{
	/* Element offsets of the second, third and fourth taps.
	 */
	const int tap1 = bands;
	const int tap2 = tap1 + tap1;
	const int tap3 = tap1 + tap2;

	const int k0 = cx[0];
	const int k1 = cx[1];
	const int k2 = cx[2];
	const int k3 = cx[3];

	for( int band = 0; band < bands; band++ ) {
		const VipsPel *p = in + band;
		const int value = unsigned_fixed_round(
			k0 * p[0] +
			k1 * p[tap1] +
			k2 * p[tap2] +
			k3 * p[tap3] );

		/* Keep the result inside the uint8 range before storing.
		 */
		out[band] = VIPS_CLIP( 0, value, 255 );
	}
}
/* Our inner loop. Operate on elements of size T, gather results in an
* intermediate of type IT.
*/
template <typename T, typename IT>
static IT
reducehl3_sum( const T * restrict in, int bands, const IT * restrict c, int n )
{
IT sum;
sum = 0;
for( int i = 0; i < n; i++ )
sum += c[i] * in[i * bands];
return( sum );
}
template <typename T, int max_value>
static void inline
reducehl3_unsigned_int_tab( VipsReducehl3 *reducehl3,
@ -167,13 +219,9 @@ reducehl3_unsigned_int_tab( VipsReducehl3 *reducehl3,
for( int z = 0; z < bands; z++ ) {
int sum;
sum = 0;
for( int i = 0; i < reducehl3->n_points; i++ )
sum += cx[i] * in[i * bands];
sum = reducehl3_sum<T, int>(in, bands, cx, reducehl3->n_points);
sum = unsigned_fixed_round( sum );
sum = VIPS_CLIP( 0, sum, max_value );
out[z] = sum;
@ -182,6 +230,113 @@ reducehl3_unsigned_int_tab( VipsReducehl3 *reducehl3,
}
}
/* Integer-coefficient version for signed integer pixels.
 *
 * For each band, the weighted sum of reducehl3->n_points input elements is
 * accumulated in an int, passed through signed_fixed_round(), clipped to
 * [min_value, max_value] and stored as a T.
 */
template <typename T, int min_value, int max_value>
static void inline
reducehl3_signed_int_tab( VipsReducehl3 *reducehl3,
	VipsPel *pout, const VipsPel *pin,
	const int bands, const int * restrict cx )
{
	T* restrict q = (T *) pout;
	const T* restrict p = (T *) pin;
	const int n_points = reducehl3->n_points;

	for( int band = 0; band < bands; band++ ) {
		const int total = reducehl3_sum<T, int>(
			p + band, bands, cx, n_points );
		const int rounded = signed_fixed_round( total );

		q[band] = VIPS_CLIP( min_value, rounded, max_value );
	}
}
/* Floating-point version: double coefficients, double accumulator.
 *
 * Each band's weighted sum is converted to T on store, with no rounding or
 * clipping step.
 */
template <typename T>
static void inline
reducehl3_float_tab( VipsReducehl3 *reducehl3,
	VipsPel *pout, const VipsPel *pin,
	const int bands, const double *cx )
{
	T* restrict q = (T *) pout;
	const T* restrict p = (T *) pin;
	const int n_points = reducehl3->n_points;

	for( int band = 0; band < bands; band++ )
		q[band] = reducehl3_sum<T, double>(
			p + band, bands, cx, n_points );
}
/* Unsigned 32-bit version. A 32-bit int output needs a double intermediate,
 * so this takes double coefficients and accumulates in a double.
 *
 * Each band's sum is clipped to [0, max_value] and then converted to T.
 *
 * NOTE(review): max_value is an int template parameter, so the upper clip
 * can never be larger than INT_MAX.
 */
template <typename T, int max_value>
static void inline
reducehl3_unsigned_int32_tab( VipsReducehl3 *reducehl3,
	VipsPel *pout, const VipsPel *pin,
	const int bands, const double * restrict cx )
{
	T* restrict q = (T *) pout;
	const T* restrict p = (T *) pin;
	const int n_points = reducehl3->n_points;

	for( int band = 0; band < bands; band++ ) {
		const double total = reducehl3_sum<T, double>(
			p + band, bands, cx, n_points );

		q[band] = VIPS_CLIP( 0, total, max_value );
	}
}
/* Signed 32-bit version: double coefficients and a double accumulator.
 *
 * Each band's sum is clipped to [min_value, max_value] while still a
 * double, then converted to T on store.
 */
template <typename T, int min_value, int max_value>
static void inline
reducehl3_signed_int32_tab( VipsReducehl3 *reducehl3,
	VipsPel *pout, const VipsPel *pin,
	const int bands, const double * restrict cx )
{
	T* restrict q = (T *) pout;
	const T* restrict p = (T *) pin;
	const int n_points = reducehl3->n_points;

	for( int band = 0; band < bands; band++ ) {
		const double total = reducehl3_sum<T, double>(
			p + band, bands, cx, n_points );
		const double clipped =
			VIPS_CLIP( min_value, total, max_value );

		q[band] = clipped;
	}
}
/* Ultra-high-quality version for double images.
 *
 * Rather than taking a ready-made coefficient array, this builds the mask
 * for reducehl3->kernel at position @x with vips_reducehl3_make_mask() on
 * every call, then applies it to each band with a double accumulator.
 */
template <typename T>
static void inline
reducehl3_notab( VipsReducehl3 *reducehl3,
	VipsPel *pout, const VipsPel *pin,
	const int bands, double x )
{
	T* restrict q = (T *) pout;
	const T* restrict p = (T *) pin;

	double mask[MAX_POINTS];

	vips_reducehl3_make_mask( reducehl3->kernel, x, mask );

	const int n_points = reducehl3->n_points;

	for( int band = 0; band < bands; band++ )
		q[band] = reducehl3_sum<T, double>(
			p + band, bands, mask, n_points );
}
static int
vips_reducehl3_gen( VipsRegion *out_region, void *seq,
void *a, void *b, gboolean *stop )
@ -231,12 +386,63 @@ vips_reducehl3_gen( VipsRegion *out_region, void *seq,
switch( in->BandFmt ) {
case VIPS_FORMAT_UCHAR:
reducehl3_unsigned_int_tab
<unsigned char, UCHAR_MAX>(
if( reducehl3->n_points == 4 )
reducehl3_unsigned_uint8_4tab(
q, p, bands, cxi );
else
reducehl3_unsigned_int_tab
<unsigned char, UCHAR_MAX>(
reducehl3,
q, p, bands, cxi );
break;
case VIPS_FORMAT_CHAR:
reducehl3_signed_int_tab
<signed char, SCHAR_MIN, SCHAR_MAX>(
reducehl3,
q, p, bands, cxi );
break;
case VIPS_FORMAT_USHORT:
reducehl3_unsigned_int_tab
<unsigned short, USHRT_MAX>(
reducehl3,
q, p, bands, cxi );
break;
case VIPS_FORMAT_SHORT:
reducehl3_signed_int_tab
<signed short, SHRT_MIN, SHRT_MAX>(
reducehl3,
q, p, bands, cxi );
break;
case VIPS_FORMAT_UINT:
reducehl3_unsigned_int32_tab
<unsigned int, INT_MAX>(
reducehl3,
q, p, bands, cxf );
break;
case VIPS_FORMAT_INT:
reducehl3_signed_int32_tab
<signed int, INT_MIN, INT_MAX>(
reducehl3,
q, p, bands, cxf );
break;
case VIPS_FORMAT_FLOAT:
case VIPS_FORMAT_COMPLEX:
reducehl3_float_tab<float>( reducehl3,
q, p, bands, cxf );
break;
case VIPS_FORMAT_DOUBLE:
case VIPS_FORMAT_DPCOMPLEX:
reducehl3_notab<double>( reducehl3,
q, p, bands, X - ix );
break;
default:
g_assert_not_reached();
break;