From a55513a194eb65c2f9b6197a8357b30740427160 Mon Sep 17 00:00:00 2001 From: Kleis Auke Wolthuizen Date: Fri, 12 Mar 2021 12:57:36 +0100 Subject: [PATCH 1/4] Ensure max_band vector is aligned on a 16-byte boundary See https://github.com/mstorsjo/llvm-mingw/issues/190 --- libvips/conversion/composite.cpp | 87 ++++++++++++++++++++++++-------- 1 file changed, 65 insertions(+), 22 deletions(-) diff --git a/libvips/conversion/composite.cpp b/libvips/conversion/composite.cpp index 946975d5..b9292d60 100644 --- a/libvips/conversion/composite.cpp +++ b/libvips/conversion/composite.cpp @@ -130,12 +130,6 @@ typedef struct _VipsCompositeBase { */ gboolean skippable; -#ifdef HAVE_VECTOR_ARITH - /* max_band as a vector, for the RGBA case. - */ - v4f max_band_vec; -#endif /*HAVE_VECTOR_ARITH*/ - } VipsCompositeBase; typedef VipsConversionClass VipsCompositeBaseClass; @@ -194,6 +188,16 @@ typedef struct { */ VipsPel **p; +#ifdef HAVE_VECTOR_ARITH + /* A pointer to the 'real' memory. + */ + void *mem; + + /* max_band as a vector, for the RGBA case. + */ + v4f max_band_vec; +#endif /*HAVE_VECTOR_ARITH*/ + } VipsCompositeSequence; static int @@ -216,7 +220,14 @@ vips_composite_stop( void *vseq, void *a, void *b ) VIPS_FREE( seq->enabled ); VIPS_FREE( seq->p ); +#ifdef HAVE_VECTOR_ARITH + /* Must use g_free here, otherwise we end up writing to a + * pointer that we just freed. + */ + g_free( seq->mem ); +#else /*!defined(HAVE_VECTOR_ARITH)*/ VIPS_FREE( seq ); +#endif /*HAVE_VECTOR_ARITH*/ return( 0 ); } @@ -227,12 +238,38 @@ vips_composite_start( VipsImage *out, void *a, void *b ) VipsImage **in = (VipsImage **) a; VipsCompositeBase *composite = (VipsCompositeBase *) b; + void *mem; VipsCompositeSequence *seq; - int i, n; + int i, n, size; - if( !(seq = VIPS_NEW( NULL, VipsCompositeSequence )) ) + /* The size of our struct. + */ + size = sizeof( VipsCompositeSequence ); + +#ifdef HAVE_VECTOR_ARITH + /* Ensure that the memory is aligned on a 16-byte boundary. + */ + size += 16 - 1; +#endif /*HAVE_VECTOR_ARITH*/ + + /* Allocate a new chunk of memory. + */ + if( !(mem = vips_malloc( NULL, size )) ) return( NULL ); +#ifdef HAVE_VECTOR_ARITH + /* Our aligned pointer. + */ + seq = (VipsCompositeSequence *) + (((guintptr) mem + 15) & ~(guintptr) 0x0F); + + /* Store the pointer to the 'real' memory. + */ + seq->mem = mem; +#else /*!defined(HAVE_VECTOR_ARITH)*/ + seq = (VipsCompositeSequence *) mem; +#endif /*HAVE_VECTOR_ARITH*/ + seq->composite = composite; seq->input_regions = NULL; seq->enabled = NULL; @@ -280,7 +317,19 @@ vips_composite_start( VipsImage *out, void *a, void *b ) return( NULL ); } } - + +#ifdef HAVE_VECTOR_ARITH + /* We need a float version for the vector path. + */ + if( composite->bands == 3 ) + seq->max_band_vec = (v4f){ + (float) composite->max_band[0], + (float) composite->max_band[1], + (float) composite->max_band[2], + (float) composite->max_band[3] + }; +#endif + return( seq ); } @@ -664,9 +713,11 @@ vips_composite_base_blend( VipsCompositeBase *composite, */ template static void -vips_composite_base_blend3( VipsCompositeBase *composite, +vips_composite_base_blend3( VipsCompositeSequence *seq, VipsBlendMode mode, v4f &B, T * restrict p ) { + VipsCompositeBase *composite = seq->composite; + v4f A; float aA; float aB; @@ -684,7 +735,7 @@ vips_composite_base_blend3( VipsCompositeBase *composite, A[2] = p[2]; A[3] = p[3]; - A /= composite->max_band_vec; + A /= seq->max_band_vec; aA = A[3]; aB = B[3]; @@ -975,7 +1026,7 @@ vips_combine_pixels3( VipsCompositeSequence *seq, VipsPel *q ) /* Scale the base pixel to 0 - 1. */ - B /= composite->max_band_vec; + B /= seq->max_band_vec; aB = B[3]; if( !composite->premultiplied ) { @@ -987,7 +1038,7 @@ vips_combine_pixels3( VipsCompositeSequence *seq, VipsPel *q ) int j = seq->enabled[i]; VipsBlendMode m = n_mode == 1 ? mode[0] : mode[j - 1]; - vips_composite_base_blend3( composite, m, B, tp[i] ); + vips_composite_base_blend3( seq, m, B, tp[i] ); } /* Unpremultiply, if necessary. @@ -1006,7 +1057,7 @@ vips_combine_pixels3( VipsCompositeSequence *seq, VipsPel *q ) /* Write back as a full range pixel, clipping to range. */ - B *= composite->max_band_vec; + B *= seq->max_band_vec; if( min_T != 0 || max_T != 0 ) { float low = min_T; @@ -1386,14 +1437,6 @@ vips_composite_base_build( VipsObject *object ) return( -1 ); } -#ifdef HAVE_VECTOR_ARITH - /* We need a float version for the vector path. - */ - if( composite->bands == 3 ) - for( int b = 0; b <= 3; b++ ) - composite->max_band_vec[b] = composite->max_band[b]; -#endif /*HAVE_VECTOR_ARITH*/ - /* Transform the input images to match in format. We may have * mixed float and double, for example. */ From 5ef9c84f97f84936676823b7c3d9998c9738d6fb Mon Sep 17 00:00:00 2001 From: Kleis Auke Wolthuizen Date: Mon, 15 Mar 2021 11:10:16 +0100 Subject: [PATCH 2/4] Use cross-platform functions for allocating aligned memory A malloc library is expected to provide a better implementation. --- configure.ac | 2 +- libvips/conversion/composite.cpp | 80 ++++++++++++++++++-------------- 2 files changed, 47 insertions(+), 35 deletions(-) diff --git a/configure.ac b/configure.ac index 2b928842..176bee11 100644 --- a/configure.ac +++ b/configure.ac @@ -409,7 +409,7 @@ fi AC_FUNC_MEMCMP AC_FUNC_MMAP AC_FUNC_VPRINTF -AC_CHECK_FUNCS([getcwd gettimeofday getwd memset munmap putenv realpath strcasecmp strchr strcspn strdup strerror strrchr strspn vsnprintf realpath mkstemp mktemp random rand sysconf atexit]) +AC_CHECK_FUNCS([getcwd gettimeofday getwd memset munmap putenv realpath strcasecmp strchr strcspn strdup strerror strrchr strspn vsnprintf realpath mkstemp mktemp random rand sysconf atexit _aligned_malloc posix_memalign memalign]) AC_CHECK_LIB(m,cbrt,[AC_DEFINE(HAVE_CBRT,1,[have cbrt() in libm.])]) AC_CHECK_LIB(m,hypot,[AC_DEFINE(HAVE_HYPOT,1,[have hypot() in libm.])]) AC_CHECK_LIB(m,atan2,[AC_DEFINE(HAVE_ATAN2,1,[have atan2() in libm.])]) diff --git a/libvips/conversion/composite.cpp b/libvips/conversion/composite.cpp index b9292d60..f083d60e 100644 --- a/libvips/conversion/composite.cpp +++ b/libvips/conversion/composite.cpp @@ -55,13 +55,17 @@ #include #include -#if _MSC_VER +#ifdef _MSC_VER #include #else #include #endif #include +#if defined(HAVE__ALIGNED_MALLOC) || defined(HAVE_MEMALIGN) +#include +#endif + #include #include #include @@ -159,7 +163,8 @@ vips_composite_base_dispose( GObject *gobject ) G_OBJECT_CLASS( vips_composite_base_parent_class )->dispose( gobject ); } -/* Our sequence value. +/* Our sequence value. This must be aligned on a 16-byte boundary when + * HAVE_VECTOR_ARITH is defined. */ typedef struct { VipsCompositeBase *composite; @@ -189,10 +194,6 @@ typedef struct { VipsPel **p; #ifdef HAVE_VECTOR_ARITH - /* A pointer to the 'real' memory. - */ - void *mem; - /* max_band as a vector, for the RGBA case. */ v4f max_band_vec; @@ -200,6 +201,39 @@ typedef struct { } VipsCompositeSequence; +#ifdef HAVE_VECTOR_ARITH +/* Allocate aligned memory. The return value can be released + * by calling the vips_free_aligned() function, for example: + * VIPS_FREEF( vips_free_aligned, ptr ); + */ +static inline void * +vips_alloc_aligned( size_t sz, size_t align ) +{ + g_assert( !(align & (align - 1)) ); +#ifdef HAVE__ALIGNED_MALLOC + return _aligned_malloc( sz, align ); +#elif defined(HAVE_POSIX_MEMALIGN) + void *ptr; + if( posix_memalign( &ptr, align, sz ) ) return NULL; + return ptr; +#elif defined(HAVE_MEMALIGN) + return memalign( align, sz ); +#else +#error Missing aligned alloc implementation +#endif +} + +static inline void +vips_free_aligned( void* ptr ) +{ +#ifdef HAVE__ALIGNED_MALLOC + _aligned_free( ptr ); +#else /*defined(HAVE_POSIX_MEMALIGN) || defined(HAVE_MEMALIGN)*/ + free( ptr ); +#endif +} +#endif /*HAVE_VECTOR_ARITH*/ + static int vips_composite_stop( void *vseq, void *a, void *b ) { @@ -221,10 +255,7 @@ vips_composite_stop( void *vseq, void *a, void *b ) VIPS_FREE( seq->p ); #ifdef HAVE_VECTOR_ARITH - /* Must use g_free here, otherwise we end up writing to a - * pointer that we just freed. - */ - g_free( seq->mem ); + VIPS_FREEF( vips_free_aligned, seq ); #else /*!defined(HAVE_VECTOR_ARITH)*/ VIPS_FREE( seq ); #endif /*HAVE_VECTOR_ARITH*/ @@ -238,37 +269,18 @@ vips_composite_start( VipsImage *out, void *a, void *b ) VipsImage **in = (VipsImage **) a; VipsCompositeBase *composite = (VipsCompositeBase *) b; - void *mem; VipsCompositeSequence *seq; - int i, n, size; - - /* The size of our struct. - */ - size = sizeof( VipsCompositeSequence ); + int i, n; #ifdef HAVE_VECTOR_ARITH /* Ensure that the memory is aligned on a 16-byte boundary. */ - size += 16 - 1; -#endif /*HAVE_VECTOR_ARITH*/ - - /* Allocate a new chunk of memory. - */ - if( !(mem = vips_malloc( NULL, size )) ) - return( NULL ); - -#ifdef HAVE_VECTOR_ARITH - /* Our aligned pointer. - */ - seq = (VipsCompositeSequence *) - (((guintptr) mem + 15) & ~(guintptr) 0x0F); - - /* Store the pointer to the 'real' memory. - */ - seq->mem = mem; + if( !(seq = ((VipsCompositeSequence *) vips_alloc_aligned( + sizeof( VipsCompositeSequence ), 16 ))) ) #else /*!defined(HAVE_VECTOR_ARITH)*/ - seq = (VipsCompositeSequence *) mem; + if( !(seq = VIPS_NEW( NULL, VipsCompositeSequence )) ) #endif /*HAVE_VECTOR_ARITH*/ + return( NULL ); seq->composite = composite; seq->input_regions = NULL; From e7faebf6af6b77c34d04a4e4efb7e47f124b8d39 Mon Sep 17 00:00:00 2001 From: Kleis Auke Wolthuizen Date: Sat, 20 Mar 2021 11:33:37 +0100 Subject: [PATCH 3/4] Tell the compiler that v4f is aligned It's guaranteed that this is now aligned on a 16-byte boundary. --- configure.ac | 10 +++++----- libvips/conversion/composite.cpp | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/configure.ac b/configure.ac index 176bee11..bae1cb74 100644 --- a/configure.ac +++ b/configure.ac @@ -272,7 +272,7 @@ AM_GLIB_GNU_GETTEXT # [ax_gcc_version_option=yes], # [ax_gcc_version_option=no] # ) -AC_MSG_CHECKING([for gcc version]) +AC_MSG_CHECKING([for $CC version]) GCC_VERSION="" version=$($CC -dumpversion) if test $? = 0; then @@ -326,7 +326,7 @@ AC_TYPE_SIZE_T # g++/gcc 4.x and 5.x have rather broken vector support ... 5.4.1 seems to # work, but 5.4.0 fails to even compile -AC_MSG_CHECKING([for gcc with working vector support]) +AC_MSG_CHECKING([for $CC with working vector support]) if test x"$GCC_VERSION_MAJOR" != x"4" -a x"$GCC_VERSION_MAJOR" != x"5"; then AC_MSG_RESULT([yes]) else @@ -339,7 +339,7 @@ if test x"$ax_cv_have_var_attribute_vector_size" = x"yes"; then AC_MSG_CHECKING([for C++ vector shuffle]) AC_LANG_PUSH([C++]) AC_TRY_COMPILE([ - typedef float v4f __attribute__((vector_size(4 * sizeof(float)))); + typedef float v4f __attribute__((vector_size(4 * sizeof(float)),aligned(16))); ],[ v4f f; f[3] = 99; ],[ @@ -362,7 +362,7 @@ if test x"$have_vector_shuffle" = x"yes"; then AC_MSG_CHECKING([for C++ vector arithmetic]) AC_LANG_PUSH([C++]) AC_TRY_COMPILE([ - typedef float v4f __attribute__((vector_size(4 * sizeof(float)))); + typedef float v4f __attribute__((vector_size(4 * sizeof(float)),aligned(16))); ],[ v4f f = {1, 2, 3, 4}; f *= 12.0; v4f g = {5, 6, 7, 8}; f = g > 0 ? g : -1 * g; @@ -382,7 +382,7 @@ if test x"$have_vector_arith" = x"yes"; then AC_MSG_CHECKING([for C++ signed constants in vector templates]) AC_LANG_PUSH([C++]) AC_TRY_COMPILE([ - typedef float v4f __attribute__((vector_size(4 * sizeof(float)))); + typedef float v4f __attribute__((vector_size(4 * sizeof(float)),aligned(16))); template static void h( v4f B ) diff --git a/libvips/conversion/composite.cpp b/libvips/conversion/composite.cpp index f083d60e..f92d82ec 100644 --- a/libvips/conversion/composite.cpp +++ b/libvips/conversion/composite.cpp @@ -85,7 +85,7 @@ #ifdef HAVE_VECTOR_ARITH /* A vector of four floats. */ -typedef float v4f __attribute__((vector_size(4 * sizeof(float)))); +typedef float v4f __attribute__((vector_size(4 * sizeof(float)),aligned(16))); #endif /*HAVE_VECTOR_ARITH*/ typedef struct _VipsCompositeBase { From 305714c978349e5fffb82f26a8529fe9db4c7a6d Mon Sep 17 00:00:00 2001 From: Kleis Auke Wolthuizen Date: Tue, 23 Mar 2021 16:47:53 +0100 Subject: [PATCH 4/4] Move max_band_vec to first position --- libvips/conversion/composite.cpp | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/libvips/conversion/composite.cpp b/libvips/conversion/composite.cpp index f92d82ec..267e0a56 100644 --- a/libvips/conversion/composite.cpp +++ b/libvips/conversion/composite.cpp @@ -163,10 +163,17 @@ vips_composite_base_dispose( GObject *gobject ) G_OBJECT_CLASS( vips_composite_base_parent_class )->dispose( gobject ); } -/* Our sequence value. This must be aligned on a 16-byte boundary when - * HAVE_VECTOR_ARITH is defined. +/* Our sequence value. */ typedef struct { +#ifdef HAVE_VECTOR_ARITH + /* max_band as a vector, for the RGBA case. This must be + * defined first to ensure that the member is aligned + * on a 16-byte boundary. + */ + v4f max_band_vec; +#endif /*HAVE_VECTOR_ARITH*/ + VipsCompositeBase *composite; /* Full set of input regions, each made on the corresponding input @@ -193,12 +200,6 @@ typedef struct { */ VipsPel **p; -#ifdef HAVE_VECTOR_ARITH - /* max_band as a vector, for the RGBA case. - */ - v4f max_band_vec; -#endif /*HAVE_VECTOR_ARITH*/ - } VipsCompositeSequence; #ifdef HAVE_VECTOR_ARITH