more im_aconv() speedups
now uses rolling boxes for the vertical pass too
This commit is contained in:
parent
4e3e0cee6c
commit
953a315755
14
TODO
14
TODO
@ -1,17 +1,3 @@
|
||||
- read_header() from im_read_dmask() annoys valgrind
|
||||
|
||||
==14466== Use of uninitialised value of size 8
|
||||
==14466== at 0xA0BE850: __strspn_sse42 (emmintrin.h:685)
|
||||
==14466== by 0x4F815AB: vips_break_token (util.c:373)
|
||||
==14466== by 0x4F87F68: read_header (rw_mask.c:412)
|
||||
==14466== by 0x4F88198: im_read_dmask (rw_mask.c:488)
|
||||
==14466== by 0x4F01C1F: dmask_init (dispatch_types.c:263)
|
||||
==14466== by 0x4F03AA2: build_args (package.c:956)
|
||||
==14466== by 0x4F03EBC: dispatch_function (package.c:1068)
|
||||
==14466== by 0x4F0403C: im_run_command (package.c:1133)
|
||||
==14466== by 0x403997: main (vips.c:1074)
|
||||
|
||||
|
||||
|
||||
|
||||
- revisit orc conv
|
||||
|
@ -53,29 +53,20 @@
|
||||
timing:
|
||||
|
||||
$ time vips im_conv_f img_0075.jpg x2.v g2d201.con
|
||||
real 11m58.769s
|
||||
user 22m46.390s
|
||||
sys 0m3.270s
|
||||
real 5m3.359s
|
||||
user 9m34.700s
|
||||
sys 0m1.500s
|
||||
|
||||
$ time vips im_aconv img_0075.jpg x.v g2d201.con 10 10
|
||||
boxes_new: min = 0, max = 1
|
||||
boxes_new: depth = 0.1, n_layers = 10
|
||||
boxes_new: generated 1130 boxes
|
||||
boxes_new: clustering with thresh 10 ...
|
||||
boxes_new: renumbering ...
|
||||
boxes_new: after renumbering, 14 boxes remain
|
||||
real 0m34.377s
|
||||
user 1m0.440s
|
||||
sys 0m0.370s
|
||||
real 0m4.877s
|
||||
user 0m7.490s
|
||||
sys 0m0.220s
|
||||
|
||||
$ vips im_subtract x.v x2.v diff.v
|
||||
$ vips im_abs diff.v abs.v
|
||||
$ vips im_max abs.v
|
||||
2.70833
|
||||
|
||||
- can we use rolling averages for the vertical pass?
|
||||
we need to search for groups with the same band and adjacent row
|
||||
|
||||
- clustering could be much faster
|
||||
|
||||
- add more bandfmt
|
||||
@ -85,9 +76,9 @@ $ vips im_max abs.v
|
||||
*/
|
||||
|
||||
/*
|
||||
*/
|
||||
#define DEBUG
|
||||
#define VIPS_DEBUG
|
||||
*/
|
||||
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include <config.h>
|
||||
@ -319,12 +310,6 @@ boxes_renumber( Boxes *boxes )
|
||||
{
|
||||
int i, j;
|
||||
|
||||
j = 0;
|
||||
for( i = 0; i < boxes->n_hline; i++ )
|
||||
if( boxes->hline[i].weight == 0 )
|
||||
j++;
|
||||
printf( "%d weight 0 hlines\n", j );
|
||||
|
||||
/* Loop for all zero-weight hlines.
|
||||
*/
|
||||
for( i = 0; i < boxes->n_hline; ) {
|
||||
@ -560,8 +545,10 @@ boxes_new( IMAGE *in, IMAGE *out, DOUBLEMASK *mask, int n_layers, int cluster )
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef DEBUG
|
||||
VIPS_DEBUG_MSG( "boxes_new: generated %d boxes\n", boxes->n_hline );
|
||||
boxes_print( boxes );
|
||||
#endif /*DEBUG*/
|
||||
|
||||
VIPS_DEBUG_MSG( "boxes_new: clustering with thresh %d ...\n",
|
||||
cluster );
|
||||
@ -628,18 +615,17 @@ typedef struct {
|
||||
|
||||
REGION *ir; /* Input region */
|
||||
|
||||
/* For the horizontal pass, offsets for start and stop. For the
|
||||
* vertical pass, just use just start to get the offsets to sum.
|
||||
/* Offsets for start and stop.
|
||||
*/
|
||||
int *start;
|
||||
int *end;
|
||||
|
||||
/* For the horizontal pass, the rolling sums. int for integer types,
|
||||
int last_stride; /* Avoid recalcing offsets, if we can */
|
||||
|
||||
/* The rolling sums. int for integer types,
|
||||
* double for floating point types.
|
||||
*/
|
||||
void *sum;
|
||||
|
||||
int last_stride; /* Avoid recalcing offsets, if we can */
|
||||
} AConvSequence;
|
||||
|
||||
/* Free a sequence value.
|
||||
@ -675,6 +661,7 @@ aconv_start( IMAGE *out, void *a, void *b )
|
||||
/* n_velement should be the largest possible dimension.
|
||||
*/
|
||||
g_assert( boxes->n_velement >= boxes->n_hline );
|
||||
g_assert( boxes->n_velement >= boxes->n_vline );
|
||||
|
||||
seq->start = IM_ARRAY( out, boxes->n_velement, int );
|
||||
seq->end = IM_ARRAY( out, boxes->n_velement, int );
|
||||
@ -904,7 +891,7 @@ aconv_vgenerate( REGION *or, void *vseq, void *a, void *b )
|
||||
Boxes *boxes = (Boxes *) b;
|
||||
|
||||
REGION *ir = seq->ir;
|
||||
const int n_velement = boxes->n_velement;
|
||||
const int n_vline = boxes->n_vline;
|
||||
DOUBLEMASK *mask = boxes->mask;
|
||||
Rect *r = &or->valid;
|
||||
|
||||
@ -914,7 +901,7 @@ aconv_vgenerate( REGION *or, void *vseq, void *a, void *b )
|
||||
2 * IM_REGION_N_ELEMENTS( or ) : IM_REGION_N_ELEMENTS( or );
|
||||
|
||||
Rect s;
|
||||
int x, y, z;
|
||||
int x, y, z, k;
|
||||
int istride;
|
||||
int ostride;
|
||||
|
||||
@ -936,15 +923,20 @@ aconv_vgenerate( REGION *or, void *vseq, void *a, void *b )
|
||||
if( seq->last_stride != istride ) {
|
||||
seq->last_stride = istride;
|
||||
|
||||
for( z = 0; z < n_velement; z++ )
|
||||
seq->start[z] = boxes->velement[z].band +
|
||||
boxes->velement[z].row * istride;
|
||||
for( z = 0; z < n_vline; z++ ) {
|
||||
seq->start[z] = boxes->vline[z].band +
|
||||
boxes->vline[z].start * istride;
|
||||
seq->end[z] = boxes->vline[z].band +
|
||||
boxes->vline[z].end * istride;
|
||||
}
|
||||
}
|
||||
|
||||
switch( boxes->in->BandFmt ) {
|
||||
case IM_BANDFMT_UCHAR:
|
||||
|
||||
for( x = 0; x < sz; x++ ) {
|
||||
int *seq_sum = (int *) seq->sum;
|
||||
|
||||
int *p;
|
||||
PEL *q;
|
||||
int sum;
|
||||
@ -953,18 +945,34 @@ aconv_vgenerate( REGION *or, void *vseq, void *a, void *b )
|
||||
(int *) IM_REGION_ADDR( ir, r->left, r->top );
|
||||
q = x + (PEL *) IM_REGION_ADDR( or, r->left, r->top );
|
||||
|
||||
for( y = 0; y < r->height; y++ ) {
|
||||
sum = 0;
|
||||
for( z = 0; z < n_velement; z++ )
|
||||
sum += boxes->velement[z].factor *
|
||||
p[seq->start[z]];
|
||||
p += istride;
|
||||
sum = 0;
|
||||
for( z = 0; z < n_vline; z++ ) {
|
||||
seq_sum[z] = 0;
|
||||
for( k = boxes->vline[z].start;
|
||||
k < boxes->vline[z].end; k++ )
|
||||
seq_sum[z] += p[k * istride +
|
||||
boxes->vline[z].band];
|
||||
sum += boxes->vline[z].factor * seq_sum[z];
|
||||
}
|
||||
sum = (sum + boxes->rounding) / boxes->area;
|
||||
CLIP_UCHAR( sum );
|
||||
*q = sum;
|
||||
q += ostride;
|
||||
|
||||
for( y = 1; y < r->height; y++ ) {
|
||||
sum = 0;
|
||||
for( z = 0; z < n_vline; z++ ) {
|
||||
seq_sum[z] += p[seq->end[z]];
|
||||
seq_sum[z] -= p[seq->start[z]];
|
||||
sum += boxes->vline[z].factor * seq_sum[z];
|
||||
}
|
||||
p += istride;
|
||||
sum = (sum + boxes->rounding) / boxes->area;
|
||||
CLIP_UCHAR( sum );
|
||||
*q = sum;
|
||||
q += ostride;
|
||||
}
|
||||
}
|
||||
q += ostride;
|
||||
}
|
||||
}
|
||||
|
||||
break;
|
||||
|
||||
@ -1076,10 +1084,14 @@ im_aconv( IMAGE *in, IMAGE *out, DOUBLEMASK *mask, int n_layers, int cluster )
|
||||
IMAGE *t[2];
|
||||
Boxes *boxes;
|
||||
|
||||
printf( "optimising boxes ...\n" );
|
||||
|
||||
if( !(boxes = boxes_new( in, out, mask, n_layers, cluster )) ||
|
||||
im_open_local_array( out, t, 2, "im_aconv", "p" ) )
|
||||
return( -1 );
|
||||
|
||||
printf( "... done\n" );
|
||||
|
||||
/*
|
||||
*/
|
||||
if( im_embed( in, t[0], 1, mask->xsize / 2, mask->ysize / 2,
|
||||
|
Loading…
Reference in New Issue
Block a user