more im_aconv() speedups
now uses rolling boxes for the vertical pass too
This commit is contained in:
parent
4e3e0cee6c
commit
953a315755
14
TODO
14
TODO
|
@ -1,17 +1,3 @@
|
||||||
- read_header() from im_read_dmask() annoys valgrind
|
|
||||||
|
|
||||||
==14466== Use of uninitialised value of size 8
|
|
||||||
==14466== at 0xA0BE850: __strspn_sse42 (emmintrin.h:685)
|
|
||||||
==14466== by 0x4F815AB: vips_break_token (util.c:373)
|
|
||||||
==14466== by 0x4F87F68: read_header (rw_mask.c:412)
|
|
||||||
==14466== by 0x4F88198: im_read_dmask (rw_mask.c:488)
|
|
||||||
==14466== by 0x4F01C1F: dmask_init (dispatch_types.c:263)
|
|
||||||
==14466== by 0x4F03AA2: build_args (package.c:956)
|
|
||||||
==14466== by 0x4F03EBC: dispatch_function (package.c:1068)
|
|
||||||
==14466== by 0x4F0403C: im_run_command (package.c:1133)
|
|
||||||
==14466== by 0x403997: main (vips.c:1074)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
- revisit orc conv
|
- revisit orc conv
|
||||||
|
|
|
@ -53,29 +53,20 @@
|
||||||
timing:
|
timing:
|
||||||
|
|
||||||
$ time vips im_conv_f img_0075.jpg x2.v g2d201.con
|
$ time vips im_conv_f img_0075.jpg x2.v g2d201.con
|
||||||
real 11m58.769s
|
real 5m3.359s
|
||||||
user 22m46.390s
|
user 9m34.700s
|
||||||
sys 0m3.270s
|
sys 0m1.500s
|
||||||
|
|
||||||
$ time vips im_aconv img_0075.jpg x.v g2d201.con 10 10
|
$ time vips im_aconv img_0075.jpg x.v g2d201.con 10 10
|
||||||
boxes_new: min = 0, max = 1
|
real 0m4.877s
|
||||||
boxes_new: depth = 0.1, n_layers = 10
|
user 0m7.490s
|
||||||
boxes_new: generated 1130 boxes
|
sys 0m0.220s
|
||||||
boxes_new: clustering with thresh 10 ...
|
|
||||||
boxes_new: renumbering ...
|
|
||||||
boxes_new: after renumbering, 14 boxes remain
|
|
||||||
real 0m34.377s
|
|
||||||
user 1m0.440s
|
|
||||||
sys 0m0.370s
|
|
||||||
|
|
||||||
$ vips im_subtract x.v x2.v diff.v
|
$ vips im_subtract x.v x2.v diff.v
|
||||||
$ vips im_abs diff.v abs.v
|
$ vips im_abs diff.v abs.v
|
||||||
$ vips im_max abs.v
|
$ vips im_max abs.v
|
||||||
2.70833
|
2.70833
|
||||||
|
|
||||||
- can we use rolling averages for the vertical pass?
|
|
||||||
we need to search for groups with the same band and adjacent row
|
|
||||||
|
|
||||||
- clustering could be much faster
|
- clustering could be much faster
|
||||||
|
|
||||||
- add more bandfmt
|
- add more bandfmt
|
||||||
|
@ -85,9 +76,9 @@ $ vips im_max abs.v
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/*
|
/*
|
||||||
*/
|
|
||||||
#define DEBUG
|
#define DEBUG
|
||||||
#define VIPS_DEBUG
|
#define VIPS_DEBUG
|
||||||
|
*/
|
||||||
|
|
||||||
#ifdef HAVE_CONFIG_H
|
#ifdef HAVE_CONFIG_H
|
||||||
#include <config.h>
|
#include <config.h>
|
||||||
|
@ -319,12 +310,6 @@ boxes_renumber( Boxes *boxes )
|
||||||
{
|
{
|
||||||
int i, j;
|
int i, j;
|
||||||
|
|
||||||
j = 0;
|
|
||||||
for( i = 0; i < boxes->n_hline; i++ )
|
|
||||||
if( boxes->hline[i].weight == 0 )
|
|
||||||
j++;
|
|
||||||
printf( "%d weight 0 hlines\n", j );
|
|
||||||
|
|
||||||
/* Loop for all zero-weight hlines.
|
/* Loop for all zero-weight hlines.
|
||||||
*/
|
*/
|
||||||
for( i = 0; i < boxes->n_hline; ) {
|
for( i = 0; i < boxes->n_hline; ) {
|
||||||
|
@ -560,8 +545,10 @@ boxes_new( IMAGE *in, IMAGE *out, DOUBLEMASK *mask, int n_layers, int cluster )
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef DEBUG
|
||||||
VIPS_DEBUG_MSG( "boxes_new: generated %d boxes\n", boxes->n_hline );
|
VIPS_DEBUG_MSG( "boxes_new: generated %d boxes\n", boxes->n_hline );
|
||||||
boxes_print( boxes );
|
boxes_print( boxes );
|
||||||
|
#endif /*DEBUG*/
|
||||||
|
|
||||||
VIPS_DEBUG_MSG( "boxes_new: clustering with thresh %d ...\n",
|
VIPS_DEBUG_MSG( "boxes_new: clustering with thresh %d ...\n",
|
||||||
cluster );
|
cluster );
|
||||||
|
@ -628,18 +615,17 @@ typedef struct {
|
||||||
|
|
||||||
REGION *ir; /* Input region */
|
REGION *ir; /* Input region */
|
||||||
|
|
||||||
/* For the horizontal pass, offsets for start and stop. For the
|
/* Offsets for start and stop.
|
||||||
* vertical pass, just use just start to get the offsets to sum.
|
|
||||||
*/
|
*/
|
||||||
int *start;
|
int *start;
|
||||||
int *end;
|
int *end;
|
||||||
|
|
||||||
/* For the horizontal pass, the rolling sums. int for integer types,
|
int last_stride; /* Avoid recalcing offsets, if we can */
|
||||||
|
|
||||||
|
/* The rolling sums. int for integer types,
|
||||||
* double for floating point types.
|
* double for floating point types.
|
||||||
*/
|
*/
|
||||||
void *sum;
|
void *sum;
|
||||||
|
|
||||||
int last_stride; /* Avoid recalcing offsets, if we can */
|
|
||||||
} AConvSequence;
|
} AConvSequence;
|
||||||
|
|
||||||
/* Free a sequence value.
|
/* Free a sequence value.
|
||||||
|
@ -675,6 +661,7 @@ aconv_start( IMAGE *out, void *a, void *b )
|
||||||
/* n_velement should be the largest possible dimension.
|
/* n_velement should be the largest possible dimension.
|
||||||
*/
|
*/
|
||||||
g_assert( boxes->n_velement >= boxes->n_hline );
|
g_assert( boxes->n_velement >= boxes->n_hline );
|
||||||
|
g_assert( boxes->n_velement >= boxes->n_vline );
|
||||||
|
|
||||||
seq->start = IM_ARRAY( out, boxes->n_velement, int );
|
seq->start = IM_ARRAY( out, boxes->n_velement, int );
|
||||||
seq->end = IM_ARRAY( out, boxes->n_velement, int );
|
seq->end = IM_ARRAY( out, boxes->n_velement, int );
|
||||||
|
@ -904,7 +891,7 @@ aconv_vgenerate( REGION *or, void *vseq, void *a, void *b )
|
||||||
Boxes *boxes = (Boxes *) b;
|
Boxes *boxes = (Boxes *) b;
|
||||||
|
|
||||||
REGION *ir = seq->ir;
|
REGION *ir = seq->ir;
|
||||||
const int n_velement = boxes->n_velement;
|
const int n_vline = boxes->n_vline;
|
||||||
DOUBLEMASK *mask = boxes->mask;
|
DOUBLEMASK *mask = boxes->mask;
|
||||||
Rect *r = &or->valid;
|
Rect *r = &or->valid;
|
||||||
|
|
||||||
|
@ -914,7 +901,7 @@ aconv_vgenerate( REGION *or, void *vseq, void *a, void *b )
|
||||||
2 * IM_REGION_N_ELEMENTS( or ) : IM_REGION_N_ELEMENTS( or );
|
2 * IM_REGION_N_ELEMENTS( or ) : IM_REGION_N_ELEMENTS( or );
|
||||||
|
|
||||||
Rect s;
|
Rect s;
|
||||||
int x, y, z;
|
int x, y, z, k;
|
||||||
int istride;
|
int istride;
|
||||||
int ostride;
|
int ostride;
|
||||||
|
|
||||||
|
@ -936,15 +923,20 @@ aconv_vgenerate( REGION *or, void *vseq, void *a, void *b )
|
||||||
if( seq->last_stride != istride ) {
|
if( seq->last_stride != istride ) {
|
||||||
seq->last_stride = istride;
|
seq->last_stride = istride;
|
||||||
|
|
||||||
for( z = 0; z < n_velement; z++ )
|
for( z = 0; z < n_vline; z++ ) {
|
||||||
seq->start[z] = boxes->velement[z].band +
|
seq->start[z] = boxes->vline[z].band +
|
||||||
boxes->velement[z].row * istride;
|
boxes->vline[z].start * istride;
|
||||||
|
seq->end[z] = boxes->vline[z].band +
|
||||||
|
boxes->vline[z].end * istride;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
switch( boxes->in->BandFmt ) {
|
switch( boxes->in->BandFmt ) {
|
||||||
case IM_BANDFMT_UCHAR:
|
case IM_BANDFMT_UCHAR:
|
||||||
|
|
||||||
for( x = 0; x < sz; x++ ) {
|
for( x = 0; x < sz; x++ ) {
|
||||||
|
int *seq_sum = (int *) seq->sum;
|
||||||
|
|
||||||
int *p;
|
int *p;
|
||||||
PEL *q;
|
PEL *q;
|
||||||
int sum;
|
int sum;
|
||||||
|
@ -953,18 +945,34 @@ aconv_vgenerate( REGION *or, void *vseq, void *a, void *b )
|
||||||
(int *) IM_REGION_ADDR( ir, r->left, r->top );
|
(int *) IM_REGION_ADDR( ir, r->left, r->top );
|
||||||
q = x + (PEL *) IM_REGION_ADDR( or, r->left, r->top );
|
q = x + (PEL *) IM_REGION_ADDR( or, r->left, r->top );
|
||||||
|
|
||||||
for( y = 0; y < r->height; y++ ) {
|
sum = 0;
|
||||||
sum = 0;
|
for( z = 0; z < n_vline; z++ ) {
|
||||||
for( z = 0; z < n_velement; z++ )
|
seq_sum[z] = 0;
|
||||||
sum += boxes->velement[z].factor *
|
for( k = boxes->vline[z].start;
|
||||||
p[seq->start[z]];
|
k < boxes->vline[z].end; k++ )
|
||||||
p += istride;
|
seq_sum[z] += p[k * istride +
|
||||||
|
boxes->vline[z].band];
|
||||||
|
sum += boxes->vline[z].factor * seq_sum[z];
|
||||||
|
}
|
||||||
|
sum = (sum + boxes->rounding) / boxes->area;
|
||||||
|
CLIP_UCHAR( sum );
|
||||||
|
*q = sum;
|
||||||
|
q += ostride;
|
||||||
|
|
||||||
|
for( y = 1; y < r->height; y++ ) {
|
||||||
|
sum = 0;
|
||||||
|
for( z = 0; z < n_vline; z++ ) {
|
||||||
|
seq_sum[z] += p[seq->end[z]];
|
||||||
|
seq_sum[z] -= p[seq->start[z]];
|
||||||
|
sum += boxes->vline[z].factor * seq_sum[z];
|
||||||
|
}
|
||||||
|
p += istride;
|
||||||
sum = (sum + boxes->rounding) / boxes->area;
|
sum = (sum + boxes->rounding) / boxes->area;
|
||||||
CLIP_UCHAR( sum );
|
CLIP_UCHAR( sum );
|
||||||
*q = sum;
|
*q = sum;
|
||||||
q += ostride;
|
q += ostride;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
@ -1076,10 +1084,14 @@ im_aconv( IMAGE *in, IMAGE *out, DOUBLEMASK *mask, int n_layers, int cluster )
|
||||||
IMAGE *t[2];
|
IMAGE *t[2];
|
||||||
Boxes *boxes;
|
Boxes *boxes;
|
||||||
|
|
||||||
|
printf( "optimising boxes ...\n" );
|
||||||
|
|
||||||
if( !(boxes = boxes_new( in, out, mask, n_layers, cluster )) ||
|
if( !(boxes = boxes_new( in, out, mask, n_layers, cluster )) ||
|
||||||
im_open_local_array( out, t, 2, "im_aconv", "p" ) )
|
im_open_local_array( out, t, 2, "im_aconv", "p" ) )
|
||||||
return( -1 );
|
return( -1 );
|
||||||
|
|
||||||
|
printf( "... done\n" );
|
||||||
|
|
||||||
/*
|
/*
|
||||||
*/
|
*/
|
||||||
if( im_embed( in, t[0], 1, mask->xsize / 2, mask->ysize / 2,
|
if( im_embed( in, t[0], 1, mask->xsize / 2, mask->ysize / 2,
|
||||||
|
|
Loading…
Reference in New Issue