Merge pull request #2144 from kleisauke/ensure-composite-alignment
Ensure max_band vector is aligned on a 16-byte boundary
This commit is contained in:
commit
0f86453f76
12
configure.ac
12
configure.ac
@ -272,7 +272,7 @@ AM_GLIB_GNU_GETTEXT
|
|||||||
# [ax_gcc_version_option=yes],
|
# [ax_gcc_version_option=yes],
|
||||||
# [ax_gcc_version_option=no]
|
# [ax_gcc_version_option=no]
|
||||||
# )
|
# )
|
||||||
AC_MSG_CHECKING([for gcc version])
|
AC_MSG_CHECKING([for $CC version])
|
||||||
GCC_VERSION=""
|
GCC_VERSION=""
|
||||||
version=$($CC -dumpversion)
|
version=$($CC -dumpversion)
|
||||||
if test $? = 0; then
|
if test $? = 0; then
|
||||||
@ -326,7 +326,7 @@ AC_TYPE_SIZE_T
|
|||||||
|
|
||||||
# g++/gcc 4.x and 5.x have rather broken vector support ... 5.4.1 seems to
|
# g++/gcc 4.x and 5.x have rather broken vector support ... 5.4.1 seems to
|
||||||
# work, but 5.4.0 fails to even compile
|
# work, but 5.4.0 fails to even compile
|
||||||
AC_MSG_CHECKING([for gcc with working vector support])
|
AC_MSG_CHECKING([for $CC with working vector support])
|
||||||
if test x"$GCC_VERSION_MAJOR" != x"4" -a x"$GCC_VERSION_MAJOR" != x"5"; then
|
if test x"$GCC_VERSION_MAJOR" != x"4" -a x"$GCC_VERSION_MAJOR" != x"5"; then
|
||||||
AC_MSG_RESULT([yes])
|
AC_MSG_RESULT([yes])
|
||||||
else
|
else
|
||||||
@ -339,7 +339,7 @@ if test x"$ax_cv_have_var_attribute_vector_size" = x"yes"; then
|
|||||||
AC_MSG_CHECKING([for C++ vector shuffle])
|
AC_MSG_CHECKING([for C++ vector shuffle])
|
||||||
AC_LANG_PUSH([C++])
|
AC_LANG_PUSH([C++])
|
||||||
AC_TRY_COMPILE([
|
AC_TRY_COMPILE([
|
||||||
typedef float v4f __attribute__((vector_size(4 * sizeof(float))));
|
typedef float v4f __attribute__((vector_size(4 * sizeof(float)),aligned(16)));
|
||||||
],[
|
],[
|
||||||
v4f f; f[3] = 99;
|
v4f f; f[3] = 99;
|
||||||
],[
|
],[
|
||||||
@ -362,7 +362,7 @@ if test x"$have_vector_shuffle" = x"yes"; then
|
|||||||
AC_MSG_CHECKING([for C++ vector arithmetic])
|
AC_MSG_CHECKING([for C++ vector arithmetic])
|
||||||
AC_LANG_PUSH([C++])
|
AC_LANG_PUSH([C++])
|
||||||
AC_TRY_COMPILE([
|
AC_TRY_COMPILE([
|
||||||
typedef float v4f __attribute__((vector_size(4 * sizeof(float))));
|
typedef float v4f __attribute__((vector_size(4 * sizeof(float)),aligned(16)));
|
||||||
],[
|
],[
|
||||||
v4f f = {1, 2, 3, 4}; f *= 12.0;
|
v4f f = {1, 2, 3, 4}; f *= 12.0;
|
||||||
v4f g = {5, 6, 7, 8}; f = g > 0 ? g : -1 * g;
|
v4f g = {5, 6, 7, 8}; f = g > 0 ? g : -1 * g;
|
||||||
@ -382,7 +382,7 @@ if test x"$have_vector_arith" = x"yes"; then
|
|||||||
AC_MSG_CHECKING([for C++ signed constants in vector templates])
|
AC_MSG_CHECKING([for C++ signed constants in vector templates])
|
||||||
AC_LANG_PUSH([C++])
|
AC_LANG_PUSH([C++])
|
||||||
AC_TRY_COMPILE([
|
AC_TRY_COMPILE([
|
||||||
typedef float v4f __attribute__((vector_size(4 * sizeof(float))));
|
typedef float v4f __attribute__((vector_size(4 * sizeof(float)),aligned(16)));
|
||||||
template <typename T>
|
template <typename T>
|
||||||
static void
|
static void
|
||||||
h( v4f B )
|
h( v4f B )
|
||||||
@ -409,7 +409,7 @@ fi
|
|||||||
AC_FUNC_MEMCMP
|
AC_FUNC_MEMCMP
|
||||||
AC_FUNC_MMAP
|
AC_FUNC_MMAP
|
||||||
AC_FUNC_VPRINTF
|
AC_FUNC_VPRINTF
|
||||||
AC_CHECK_FUNCS([getcwd gettimeofday getwd memset munmap putenv realpath strcasecmp strchr strcspn strdup strerror strrchr strspn vsnprintf realpath mkstemp mktemp random rand sysconf atexit])
|
AC_CHECK_FUNCS([getcwd gettimeofday getwd memset munmap putenv realpath strcasecmp strchr strcspn strdup strerror strrchr strspn vsnprintf realpath mkstemp mktemp random rand sysconf atexit _aligned_malloc posix_memalign memalign])
|
||||||
AC_CHECK_LIB(m,cbrt,[AC_DEFINE(HAVE_CBRT,1,[have cbrt() in libm.])])
|
AC_CHECK_LIB(m,cbrt,[AC_DEFINE(HAVE_CBRT,1,[have cbrt() in libm.])])
|
||||||
AC_CHECK_LIB(m,hypot,[AC_DEFINE(HAVE_HYPOT,1,[have hypot() in libm.])])
|
AC_CHECK_LIB(m,hypot,[AC_DEFINE(HAVE_HYPOT,1,[have hypot() in libm.])])
|
||||||
AC_CHECK_LIB(m,atan2,[AC_DEFINE(HAVE_ATAN2,1,[have atan2() in libm.])])
|
AC_CHECK_LIB(m,atan2,[AC_DEFINE(HAVE_ATAN2,1,[have atan2() in libm.])])
|
||||||
|
@ -55,13 +55,17 @@
|
|||||||
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#if _MSC_VER
|
#ifdef _MSC_VER
|
||||||
#include <cstdlib>
|
#include <cstdlib>
|
||||||
#else
|
#else
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#endif
|
#endif
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
|
|
||||||
|
#if defined(HAVE__ALIGNED_MALLOC) || defined(HAVE_MEMALIGN)
|
||||||
|
#include <malloc.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
#include <vips/vips.h>
|
#include <vips/vips.h>
|
||||||
#include <vips/internal.h>
|
#include <vips/internal.h>
|
||||||
#include <vips/debug.h>
|
#include <vips/debug.h>
|
||||||
@ -81,7 +85,7 @@
|
|||||||
#ifdef HAVE_VECTOR_ARITH
|
#ifdef HAVE_VECTOR_ARITH
|
||||||
/* A vector of four floats.
|
/* A vector of four floats.
|
||||||
*/
|
*/
|
||||||
typedef float v4f __attribute__((vector_size(4 * sizeof(float))));
|
typedef float v4f __attribute__((vector_size(4 * sizeof(float)),aligned(16)));
|
||||||
#endif /*HAVE_VECTOR_ARITH*/
|
#endif /*HAVE_VECTOR_ARITH*/
|
||||||
|
|
||||||
typedef struct _VipsCompositeBase {
|
typedef struct _VipsCompositeBase {
|
||||||
@ -130,12 +134,6 @@ typedef struct _VipsCompositeBase {
|
|||||||
*/
|
*/
|
||||||
gboolean skippable;
|
gboolean skippable;
|
||||||
|
|
||||||
#ifdef HAVE_VECTOR_ARITH
|
|
||||||
/* max_band as a vector, for the RGBA case.
|
|
||||||
*/
|
|
||||||
v4f max_band_vec;
|
|
||||||
#endif /*HAVE_VECTOR_ARITH*/
|
|
||||||
|
|
||||||
} VipsCompositeBase;
|
} VipsCompositeBase;
|
||||||
|
|
||||||
typedef VipsConversionClass VipsCompositeBaseClass;
|
typedef VipsConversionClass VipsCompositeBaseClass;
|
||||||
@ -168,6 +166,14 @@ vips_composite_base_dispose( GObject *gobject )
|
|||||||
/* Our sequence value.
|
/* Our sequence value.
|
||||||
*/
|
*/
|
||||||
typedef struct {
|
typedef struct {
|
||||||
|
#ifdef HAVE_VECTOR_ARITH
|
||||||
|
/* max_band as a vector, for the RGBA case. This must be
|
||||||
|
* defined first to ensure that the member is aligned
|
||||||
|
* on a 16-byte boundary.
|
||||||
|
*/
|
||||||
|
v4f max_band_vec;
|
||||||
|
#endif /*HAVE_VECTOR_ARITH*/
|
||||||
|
|
||||||
VipsCompositeBase *composite;
|
VipsCompositeBase *composite;
|
||||||
|
|
||||||
/* Full set of input regions, each made on the corresponding input
|
/* Full set of input regions, each made on the corresponding input
|
||||||
@ -196,6 +202,39 @@ typedef struct {
|
|||||||
|
|
||||||
} VipsCompositeSequence;
|
} VipsCompositeSequence;
|
||||||
|
|
||||||
|
#ifdef HAVE_VECTOR_ARITH
|
||||||
|
/* Allocate aligned memory. The return value can be released
|
||||||
|
* by calling the vips_free_aligned() function, for example:
|
||||||
|
* VIPS_FREEF( vips_free_aligned, ptr );
|
||||||
|
*/
|
||||||
|
static inline void *
|
||||||
|
vips_alloc_aligned( size_t sz, size_t align )
|
||||||
|
{
|
||||||
|
g_assert( !(align & (align - 1)) );
|
||||||
|
#ifdef HAVE__ALIGNED_MALLOC
|
||||||
|
return _aligned_malloc( sz, align );
|
||||||
|
#elif defined(HAVE_POSIX_MEMALIGN)
|
||||||
|
void *ptr;
|
||||||
|
if( posix_memalign( &ptr, align, sz ) ) return NULL;
|
||||||
|
return ptr;
|
||||||
|
#elif defined(HAVE_MEMALIGN)
|
||||||
|
return memalign( align, sz );
|
||||||
|
#else
|
||||||
|
#error Missing aligned alloc implementation
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Release a block obtained from vips_alloc_aligned().
 *
 * @ptr: the pointer to free.
 *
 * The deallocator is picked with the same HAVE__ALIGNED_MALLOC test that
 * the allocator uses, so the pair always matches.
 */
static inline void
vips_free_aligned( void *ptr )
{
#ifndef HAVE__ALIGNED_MALLOC
	/* The posix_memalign() / memalign() case: plain free() is used.
	 */
	free( ptr );
#else /*defined(HAVE__ALIGNED_MALLOC)*/
	/* Blocks from _aligned_malloc() go back through _aligned_free().
	 */
	_aligned_free( ptr );
#endif /*!HAVE__ALIGNED_MALLOC*/
}
|
||||||
|
#endif /*HAVE_VECTOR_ARITH*/
|
||||||
|
|
||||||
static int
|
static int
|
||||||
vips_composite_stop( void *vseq, void *a, void *b )
|
vips_composite_stop( void *vseq, void *a, void *b )
|
||||||
{
|
{
|
||||||
@ -216,7 +255,11 @@ vips_composite_stop( void *vseq, void *a, void *b )
|
|||||||
VIPS_FREE( seq->enabled );
|
VIPS_FREE( seq->enabled );
|
||||||
VIPS_FREE( seq->p );
|
VIPS_FREE( seq->p );
|
||||||
|
|
||||||
|
#ifdef HAVE_VECTOR_ARITH
|
||||||
|
VIPS_FREEF( vips_free_aligned, seq );
|
||||||
|
#else /*!defined(HAVE_VECTOR_ARITH)*/
|
||||||
VIPS_FREE( seq );
|
VIPS_FREE( seq );
|
||||||
|
#endif /*HAVE_VECTOR_ARITH*/
|
||||||
|
|
||||||
return( 0 );
|
return( 0 );
|
||||||
}
|
}
|
||||||
@ -230,7 +273,14 @@ vips_composite_start( VipsImage *out, void *a, void *b )
|
|||||||
VipsCompositeSequence *seq;
|
VipsCompositeSequence *seq;
|
||||||
int i, n;
|
int i, n;
|
||||||
|
|
||||||
|
#ifdef HAVE_VECTOR_ARITH
|
||||||
|
/* Ensure that the memory is aligned on a 16-byte boundary.
|
||||||
|
*/
|
||||||
|
if( !(seq = ((VipsCompositeSequence *) vips_alloc_aligned(
|
||||||
|
sizeof( VipsCompositeSequence ), 16 ))) )
|
||||||
|
#else /*!defined(HAVE_VECTOR_ARITH)*/
|
||||||
if( !(seq = VIPS_NEW( NULL, VipsCompositeSequence )) )
|
if( !(seq = VIPS_NEW( NULL, VipsCompositeSequence )) )
|
||||||
|
#endif /*HAVE_VECTOR_ARITH*/
|
||||||
return( NULL );
|
return( NULL );
|
||||||
|
|
||||||
seq->composite = composite;
|
seq->composite = composite;
|
||||||
@ -281,6 +331,18 @@ vips_composite_start( VipsImage *out, void *a, void *b )
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef HAVE_VECTOR_ARITH
|
||||||
|
/* We need a float version for the vector path.
|
||||||
|
*/
|
||||||
|
if( composite->bands == 3 )
|
||||||
|
seq->max_band_vec = (v4f){
|
||||||
|
(float) composite->max_band[0],
|
||||||
|
(float) composite->max_band[1],
|
||||||
|
(float) composite->max_band[2],
|
||||||
|
(float) composite->max_band[3]
|
||||||
|
};
|
||||||
|
#endif
|
||||||
|
|
||||||
return( seq );
|
return( seq );
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -664,9 +726,11 @@ vips_composite_base_blend( VipsCompositeBase *composite,
|
|||||||
*/
|
*/
|
||||||
template <typename T>
|
template <typename T>
|
||||||
static void
|
static void
|
||||||
vips_composite_base_blend3( VipsCompositeBase *composite,
|
vips_composite_base_blend3( VipsCompositeSequence *seq,
|
||||||
VipsBlendMode mode, v4f &B, T * restrict p )
|
VipsBlendMode mode, v4f &B, T * restrict p )
|
||||||
{
|
{
|
||||||
|
VipsCompositeBase *composite = seq->composite;
|
||||||
|
|
||||||
v4f A;
|
v4f A;
|
||||||
float aA;
|
float aA;
|
||||||
float aB;
|
float aB;
|
||||||
@ -684,7 +748,7 @@ vips_composite_base_blend3( VipsCompositeBase *composite,
|
|||||||
A[2] = p[2];
|
A[2] = p[2];
|
||||||
A[3] = p[3];
|
A[3] = p[3];
|
||||||
|
|
||||||
A /= composite->max_band_vec;
|
A /= seq->max_band_vec;
|
||||||
|
|
||||||
aA = A[3];
|
aA = A[3];
|
||||||
aB = B[3];
|
aB = B[3];
|
||||||
@ -975,7 +1039,7 @@ vips_combine_pixels3( VipsCompositeSequence *seq, VipsPel *q )
|
|||||||
|
|
||||||
/* Scale the base pixel to 0 - 1.
|
/* Scale the base pixel to 0 - 1.
|
||||||
*/
|
*/
|
||||||
B /= composite->max_band_vec;
|
B /= seq->max_band_vec;
|
||||||
aB = B[3];
|
aB = B[3];
|
||||||
|
|
||||||
if( !composite->premultiplied ) {
|
if( !composite->premultiplied ) {
|
||||||
@ -987,7 +1051,7 @@ vips_combine_pixels3( VipsCompositeSequence *seq, VipsPel *q )
|
|||||||
int j = seq->enabled[i];
|
int j = seq->enabled[i];
|
||||||
VipsBlendMode m = n_mode == 1 ? mode[0] : mode[j - 1];
|
VipsBlendMode m = n_mode == 1 ? mode[0] : mode[j - 1];
|
||||||
|
|
||||||
vips_composite_base_blend3<T>( composite, m, B, tp[i] );
|
vips_composite_base_blend3<T>( seq, m, B, tp[i] );
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Unpremultiply, if necessary.
|
/* Unpremultiply, if necessary.
|
||||||
@ -1006,7 +1070,7 @@ vips_combine_pixels3( VipsCompositeSequence *seq, VipsPel *q )
|
|||||||
|
|
||||||
/* Write back as a full range pixel, clipping to range.
|
/* Write back as a full range pixel, clipping to range.
|
||||||
*/
|
*/
|
||||||
B *= composite->max_band_vec;
|
B *= seq->max_band_vec;
|
||||||
if( min_T != 0 ||
|
if( min_T != 0 ||
|
||||||
max_T != 0 ) {
|
max_T != 0 ) {
|
||||||
float low = min_T;
|
float low = min_T;
|
||||||
@ -1386,14 +1450,6 @@ vips_composite_base_build( VipsObject *object )
|
|||||||
return( -1 );
|
return( -1 );
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef HAVE_VECTOR_ARITH
|
|
||||||
/* We need a float version for the vector path.
|
|
||||||
*/
|
|
||||||
if( composite->bands == 3 )
|
|
||||||
for( int b = 0; b <= 3; b++ )
|
|
||||||
composite->max_band_vec[b] = composite->max_band[b];
|
|
||||||
#endif /*HAVE_VECTOR_ARITH*/
|
|
||||||
|
|
||||||
/* Transform the input images to match in format. We may have
|
/* Transform the input images to match in format. We may have
|
||||||
* mixed float and double, for example.
|
* mixed float and double, for example.
|
||||||
*/
|
*/
|
||||||
|
Loading…
Reference in New Issue
Block a user