diff --git a/ChangeLog b/ChangeLog
index 3a5fe3b3..b977a9e7 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -33,6 +33,7 @@
 - fix issue thumbnailing RGBA images in linear mode [jjonesrs]
 - improve vipsthumbnail profile handling
 - fix tiff deflate predictor setting [Adios]
+- fix vector path for composite on i386 [kleisauke]
 
 18/12/20 started 8.10.5
 - fix potential /0 in animated webp load [lovell]
diff --git a/configure.ac b/configure.ac
index 88ece43f..e1cb4002 100644
--- a/configure.ac
+++ b/configure.ac
@@ -256,7 +256,7 @@ AM_GLIB_GNU_GETTEXT
 # [ax_gcc_version_option=yes],
 # [ax_gcc_version_option=no]
 # )
-AC_MSG_CHECKING([for gcc version])
+AC_MSG_CHECKING([for $CC version])
 GCC_VERSION=""
 version=$($CC -dumpversion)
 if test $? = 0; then
@@ -310,7 +310,7 @@ AC_TYPE_SIZE_T
 
 # g++/gcc 4.x and 5.x have rather broken vector support ... 5.4.1 seems to
 # work, but 5.4.0 fails to even compile
-AC_MSG_CHECKING([for gcc with working vector support])
+AC_MSG_CHECKING([for $CC with working vector support])
 if test x"$GCC_VERSION_MAJOR" != x"4" -a x"$GCC_VERSION_MAJOR" != x"5"; then
   AC_MSG_RESULT([yes])
 else
@@ -323,7 +323,7 @@ if test x"$ax_cv_have_var_attribute_vector_size" = x"yes"; then
   AC_MSG_CHECKING([for C++ vector shuffle])
   AC_LANG_PUSH([C++])
   AC_TRY_COMPILE([
-    typedef float v4f __attribute__((vector_size(4 * sizeof(float))));
+    typedef float v4f __attribute__((vector_size(4 * sizeof(float)),aligned(16)));
   ],[
     v4f f; f[3] = 99;
   ],[
@@ -346,7 +346,7 @@ if test x"$have_vector_shuffle" = x"yes"; then
   AC_MSG_CHECKING([for C++ vector arithmetic])
   AC_LANG_PUSH([C++])
   AC_TRY_COMPILE([
-    typedef float v4f __attribute__((vector_size(4 * sizeof(float))));
+    typedef float v4f __attribute__((vector_size(4 * sizeof(float)),aligned(16)));
   ],[
     v4f f = {1, 2, 3, 4}; f *= 12.0;
     v4f g = {5, 6, 7, 8}; f = g > 0 ? g : -1 * g;
   ],[
@@ -366,7 +366,7 @@ if test x"$have_vector_arith" = x"yes"; then
   AC_MSG_CHECKING([for C++ signed constants in vector templates])
   AC_LANG_PUSH([C++])
   AC_TRY_COMPILE([
-    typedef float v4f __attribute__((vector_size(4 * sizeof(float))));
+    typedef float v4f __attribute__((vector_size(4 * sizeof(float)),aligned(16)));
     template
     static void
     h( v4f B )
@@ -393,7 +393,7 @@ fi
 AC_FUNC_MEMCMP
 AC_FUNC_MMAP
 AC_FUNC_VPRINTF
-AC_CHECK_FUNCS([getcwd gettimeofday getwd memset munmap putenv realpath strcasecmp strchr strcspn strdup strerror strrchr strspn vsnprintf realpath mkstemp mktemp random rand sysconf atexit])
+AC_CHECK_FUNCS([getcwd gettimeofday getwd memset munmap putenv realpath strcasecmp strchr strcspn strdup strerror strrchr strspn vsnprintf realpath mkstemp mktemp random rand sysconf atexit _aligned_malloc posix_memalign memalign])
 AC_CHECK_LIB(m,cbrt,[AC_DEFINE(HAVE_CBRT,1,[have cbrt() in libm.])])
 AC_CHECK_LIB(m,hypot,[AC_DEFINE(HAVE_HYPOT,1,[have hypot() in libm.])])
 AC_CHECK_LIB(m,atan2,[AC_DEFINE(HAVE_ATAN2,1,[have atan2() in libm.])])
diff --git a/libvips/conversion/composite.cpp b/libvips/conversion/composite.cpp
index 946975d5..267e0a56 100644
--- a/libvips/conversion/composite.cpp
+++ b/libvips/conversion/composite.cpp
@@ -55,13 +55,17 @@
 #include
 #include
 
-#if _MSC_VER
+#ifdef _MSC_VER
 #include
 #else
 #include
 #endif
 #include
 
+#if defined(HAVE__ALIGNED_MALLOC) || defined(HAVE_MEMALIGN)
+#include <malloc.h>
+#endif
+
 #include
 #include
 #include
@@ -81,7 +85,7 @@
 #ifdef HAVE_VECTOR_ARITH
 /* A vector of four floats.
  */
-typedef float v4f __attribute__((vector_size(4 * sizeof(float))));
+typedef float v4f __attribute__((vector_size(4 * sizeof(float)),aligned(16)));
 #endif /*HAVE_VECTOR_ARITH*/
 
 typedef struct _VipsCompositeBase {
@@ -130,12 +134,6 @@ typedef struct _VipsCompositeBase {
 	 */
 	gboolean skippable;
 
-#ifdef HAVE_VECTOR_ARITH
-	/* max_band as a vector, for the RGBA case.
-	 */
-	v4f max_band_vec;
-#endif /*HAVE_VECTOR_ARITH*/
-
 } VipsCompositeBase;
 
 typedef VipsConversionClass VipsCompositeBaseClass;
@@ -168,6 +166,14 @@ vips_composite_base_dispose( GObject *gobject )
 /* Our sequence value.
  */
 typedef struct {
+#ifdef HAVE_VECTOR_ARITH
+	/* max_band as a vector, for the RGBA case. This must be
+	 * defined first to ensure that the member is aligned
+	 * on a 16-byte boundary.
+	 */
+	v4f max_band_vec;
+#endif /*HAVE_VECTOR_ARITH*/
+
 	VipsCompositeBase *composite;
 
 	/* Full set of input regions, each made on the corresponding input
 	 */
@@ -196,6 +202,39 @@ typedef struct {
 
 } VipsCompositeSequence;
 
+#ifdef HAVE_VECTOR_ARITH
+/* Allocate aligned memory. The return value can be released
+ * by calling the vips_free_aligned() function, for example:
+ * VIPS_FREEF( vips_free_aligned, ptr );
+ */
+static inline void *
+vips_alloc_aligned( size_t sz, size_t align )
+{
+	g_assert( !(align & (align - 1)) );
+#ifdef HAVE__ALIGNED_MALLOC
+	return _aligned_malloc( sz, align );
+#elif defined(HAVE_POSIX_MEMALIGN)
+	void *ptr;
+	if( posix_memalign( &ptr, align, sz ) ) return NULL;
+	return ptr;
+#elif defined(HAVE_MEMALIGN)
+	return memalign( align, sz );
+#else
+#error Missing aligned alloc implementation
+#endif
+}
+
+static inline void
+vips_free_aligned( void* ptr )
+{
+#ifdef HAVE__ALIGNED_MALLOC
+	_aligned_free( ptr );
+#else /*defined(HAVE_POSIX_MEMALIGN) || defined(HAVE_MEMALIGN)*/
+	free( ptr );
+#endif
+}
+#endif /*HAVE_VECTOR_ARITH*/
+
 static int
 vips_composite_stop( void *vseq, void *a, void *b )
 {
@@ -216,7 +255,11 @@ vips_composite_stop( void *vseq, void *a, void *b )
 	VIPS_FREE( seq->enabled );
 	VIPS_FREE( seq->p );
 
+#ifdef HAVE_VECTOR_ARITH
+	VIPS_FREEF( vips_free_aligned, seq );
+#else /*!defined(HAVE_VECTOR_ARITH)*/
 	VIPS_FREE( seq );
+#endif /*HAVE_VECTOR_ARITH*/
 
 	return( 0 );
 }
@@ -230,7 +273,14 @@ vips_composite_start( VipsImage *out, void *a, void *b )
 	VipsCompositeSequence *seq;
 	int i, n;
 
+#ifdef HAVE_VECTOR_ARITH
+	/* Ensure that the memory is aligned on a 16-byte boundary.
+	 */
+	if( !(seq = ((VipsCompositeSequence *) vips_alloc_aligned(
+		sizeof( VipsCompositeSequence ), 16 ))) )
+#else /*!defined(HAVE_VECTOR_ARITH)*/
 	if( !(seq = VIPS_NEW( NULL, VipsCompositeSequence )) )
+#endif /*HAVE_VECTOR_ARITH*/
 		return( NULL );
 
 	seq->composite = composite;
@@ -280,7 +330,19 @@ vips_composite_start( VipsImage *out, void *a, void *b )
 			return( NULL );
 		}
 	}
-
+
+#ifdef HAVE_VECTOR_ARITH
+	/* We need a float version for the vector path.
+	 */
+	if( composite->bands == 3 )
+		seq->max_band_vec = (v4f){
+			(float) composite->max_band[0],
+			(float) composite->max_band[1],
+			(float) composite->max_band[2],
+			(float) composite->max_band[3]
+		};
+#endif
+
 	return( seq );
 }
@@ -664,9 +726,11 @@ vips_composite_base_blend( VipsCompositeBase *composite,
  */
 template
 static void
-vips_composite_base_blend3( VipsCompositeBase *composite,
+vips_composite_base_blend3( VipsCompositeSequence *seq,
 	VipsBlendMode mode, v4f &B, T * restrict p )
 {
+	VipsCompositeBase *composite = seq->composite;
+
 	v4f A;
 	float aA;
 	float aB;
@@ -684,7 +748,7 @@ vips_composite_base_blend3( VipsCompositeBase *composite,
 	A[2] = p[2];
 	A[3] = p[3];
 
-	A /= composite->max_band_vec;
+	A /= seq->max_band_vec;
 
 	aA = A[3];
 	aB = B[3];
@@ -975,7 +1039,7 @@ vips_combine_pixels3( VipsCompositeSequence *seq, VipsPel *q )
 
 	/* Scale the base pixel to 0 - 1.
 	 */
-	B /= composite->max_band_vec;
+	B /= seq->max_band_vec;
 	aB = B[3];
 
 	if( !composite->premultiplied ) {
@@ -987,7 +1051,7 @@ vips_combine_pixels3( VipsCompositeSequence *seq, VipsPel *q )
 		int j = seq->enabled[i];
 		VipsBlendMode m = n_mode == 1 ? mode[0] : mode[j - 1];
 
-		vips_composite_base_blend3( composite, m, B, tp[i] );
+		vips_composite_base_blend3( seq, m, B, tp[i] );
 	}
 
 	/* Unpremultiply, if necessary.
@@ -1006,7 +1070,7 @@ vips_combine_pixels3( VipsCompositeSequence *seq, VipsPel *q )
 
 	/* Write back as a full range pixel, clipping to range.
 	 */
-	B *= composite->max_band_vec;
+	B *= seq->max_band_vec;
 
 	if( min_T != 0 || max_T != 0 ) {
 		float low = min_T;
@@ -1386,14 +1450,6 @@ vips_composite_base_build( VipsObject *object )
 		return( -1 );
 	}
 
-#ifdef HAVE_VECTOR_ARITH
-	/* We need a float version for the vector path.
-	 */
-	if( composite->bands == 3 )
-		for( int b = 0; b <= 3; b++ )
-			composite->max_band_vec[b] = composite->max_band[b];
-#endif /*HAVE_VECTOR_ARITH*/
-
 	/* Transform the input images to match in format. We may have
 	 * mixed float and double, for example.
 	 */
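
The change above works by moving max_band_vec into VipsCompositeSequence and allocating that struct through vips_alloc_aligned(), which picks _aligned_malloc(), posix_memalign() or memalign() depending on what the new configure.ac checks found. The sketch below is not part of the patch; it reproduces the same allocation pattern in a standalone program so it can be tried in isolation. The HAVE__ALIGNED_MALLOC / HAVE_POSIX_MEMALIGN / HAVE_MEMALIGN macros are reused from the patch but must be defined by hand here (for example: g++ -DHAVE_POSIX_MEMALIGN sketch.cpp on Linux), a GCC-compatible compiler is assumed for the vector extension, and the struct and helper names are illustrative only.

/* Standalone sketch of the aligned-allocation pattern used by the patch.
 * Not part of the diff. Build with e.g.:
 *   g++ -DHAVE_POSIX_MEMALIGN -O2 sketch.cpp -o sketch
 */
#include <cstdio>
#include <cstdlib>
#include <cstdint>

#if defined(HAVE__ALIGNED_MALLOC) || defined(HAVE_MEMALIGN)
#include <malloc.h>
#endif

/* Same typedef as composite.cpp: aligned(16) demands a 16-byte slot. */
typedef float v4f __attribute__((vector_size(4 * sizeof(float)), aligned(16)));

/* Stand-in for VipsCompositeSequence: the v4f member comes first, so the
 * struct is only safe to use if the allocation itself is 16-byte aligned.
 */
typedef struct {
	v4f max_band_vec;
	int n;
} Sequence;

static void *
alloc_aligned( size_t sz, size_t align )
{
#if defined(HAVE__ALIGNED_MALLOC)
	return _aligned_malloc( sz, align );
#elif defined(HAVE_POSIX_MEMALIGN)
	void *ptr;
	return posix_memalign( &ptr, align, sz ) ? NULL : ptr;
#elif defined(HAVE_MEMALIGN)
	return memalign( align, sz );
#else
#error define HAVE__ALIGNED_MALLOC, HAVE_POSIX_MEMALIGN or HAVE_MEMALIGN
#endif
}

static void
free_aligned( void *ptr )
{
#if defined(HAVE__ALIGNED_MALLOC)
	_aligned_free( ptr );
#else
	free( ptr );
#endif
}

int
main( void )
{
	Sequence *seq = (Sequence *) alloc_aligned( sizeof( Sequence ), 16 );

	if( !seq )
		return 1;

	/* malloc() on i386 only guarantees 8-byte alignment, so an ordinary
	 * allocation could put max_band_vec on an 8-byte boundary and fault
	 * on aligned SSE loads; this address is always a multiple of 16.
	 */
	printf( "offset within 16-byte block: %u\n",
		(unsigned) ((uintptr_t) seq & 15) );

	/* Exercise the vector member, as the composite vector path does. */
	v4f white = { 255.0f, 255.0f, 255.0f, 255.0f };
	seq->max_band_vec = white / 2;
	printf( "max_band_vec[0] = %g\n", (double) seq->max_band_vec[0] );

	free_aligned( seq );

	return 0;
}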