diff --git a/ChangeLog b/ChangeLog
index 18a4a9f6..b5ca2523 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -21,12 +21,14 @@
 - png and gif load note background colour as metadata [781545872]
 - add vips_image_[set|get]_array_double()
 - add GIF load with libnsgif
-- add JPEG2000 load and save
+- add jp2kload, jp2ksave
+- add jp2k compression to tiff load and save
 - add JPEG-XL load and save
 - add black_point_compensation flag for icc transforms
 - add "rgba" flag to vips_text() to enable full colour text rendering
 - move openslide, libheif, poppler and magick to loadable modules [kleisauke]
 - better detection of invalid ICC profiles, better fallback paths
+- add "premultiply" flag to tiffsave
 
 30/4/21 start 8.10.7
 - better vips7 PNG load compatibility [SkyDiverCool]
diff --git a/configure.ac b/configure.ac
index 76d14267..c533d219 100644
--- a/configure.ac
+++ b/configure.ac
@@ -840,10 +840,11 @@ AC_ARG_WITH([libopenjp2],
   AS_HELP_STRING([--without-libopenjp2], 
                  [build without libopenjp2 (default: test)]))
 
+# 2.4 is the first one to have working threading and tiling
 if test x"$with_libopenjp2" != x"no"; then
-  PKG_CHECK_MODULES(LIBOPENJP2, libopenjp2 >= 2.3,
+  PKG_CHECK_MODULES(LIBOPENJP2, libopenjp2 >= 2.4,
     [AC_DEFINE(HAVE_LIBOPENJP2,1,
-       [define if you have libopenjp2 >=2.2 installed.])
+       [define if you have libopenjp2 >=2.4 installed.])
      with_libopenjp2=yes
      PACKAGES_USED="$PACKAGES_USED libopenjp2"
     ],
@@ -851,14 +852,6 @@ if test x"$with_libopenjp2" != x"no"; then
      with_libopenjp2=no
     ]
   )
-
-  # 2.3 and earlier have threading problems
-  PKG_CHECK_MODULES(LIBOPENJP2_THREADING, libopenjp2 >= 2.4,
-    [AC_DEFINE(HAVE_LIBOPENJP2_THREADING,1,[define if your libopenjp2 threading works.])
-    ],
-    [:
-    ]
-  )
 fi
 
 VIPS_CFLAGS="$VIPS_CFLAGS $LIBOPENJP2_CFLAGS"
@@ -1505,6 +1498,7 @@ AC_CONFIG_FILES([
         libvips/histogram/Makefile
         libvips/draw/Makefile
         libvips/iofuncs/Makefile
+        libvips/module/Makefile
         libvips/morphology/Makefile
         libvips/mosaicing/Makefile
         libvips/create/Makefile
@@ -1559,7 +1553,7 @@ EXIF metadata support with libexif:     $with_libexif
 JPEG load/save with libjpeg:            $with_jpeg
 JXL load/save with libjxl:              $with_libjxl
 JPEG2000 load/save with libopenjp2:     $with_libopenjp2
- (requires libopenjp2 2.2 or later)
+ (requires libopenjp2 2.4 or later)
 PNG load with libspng:                  $with_libspng
  (requires libspng-0.6 or later)
 PNG load/save with libpng:              $with_png
@@ -1578,8 +1572,6 @@ PDF load with poppler-glib:             $with_poppler (dynamic module: $with_pop
 SVG load with librsvg-2.0:              $with_rsvg
  (requires librsvg-2.0 2.34.0 or later)
 EXR load with OpenEXR:                  $with_OpenEXR
-JPEG2000 load/save with libopenjp2:     $with_libopenjp2
- (requires libopenjp2 2.2 or later)
 slide load with OpenSlide:              $with_openslide (dynamic module: $with_openslide_module)
  (requires openslide-3.3.0 or later)
 Matlab load with matio:                 $with_matio
diff --git a/libvips/Makefile.am b/libvips/Makefile.am
index a6ebb41d..9345769b 100644
--- a/libvips/Makefile.am
+++ b/libvips/Makefile.am
@@ -27,6 +27,7 @@ SUBDIRS = \
 	draw \
 	iofuncs \
 	morphology \
+	module \
 	mosaicing \
 	create
 
@@ -61,7 +62,6 @@ libvips_la_LDFLAGS = \
 	-version-info @LIBRARY_CURRENT@:@LIBRARY_REVISION@:@LIBRARY_AGE@ 
 
 EXTRA_DIST = \
-	module \
 	$(OPTIONAL_DIST_DIR)
 
 CLEANFILES = 
diff --git a/libvips/foreign/Makefile.am b/libvips/foreign/Makefile.am
index 403a476a..5d6bc012 100644
--- a/libvips/foreign/Makefile.am
+++ b/libvips/foreign/Makefile.am
@@ -1,6 +1,4 @@
-if ENABLE_NSGIF
 SUBDIRS = libnsgif 
-endif
 
 noinst_LTLIBRARIES = libforeign.la
 
@@ -94,7 +92,5 @@ if !OPENSLIDE_MODULE
 libforeign_la_SOURCES += openslide2vips.c
 endif # !OPENSLIDE_MODULE
 
-EXTRA_DIST = libnsgif
-
 AM_CPPFLAGS = -I${top_srcdir}/libvips/include @VIPS_CFLAGS@ @VIPS_INCLUDES@
 
diff --git a/libvips/foreign/heifsave.c b/libvips/foreign/heifsave.c
index 700cd890..c81c515b 100644
--- a/libvips/foreign/heifsave.c
+++ b/libvips/foreign/heifsave.c
@@ -67,7 +67,7 @@
  * * @Q: %gint, quality factor
  * * @lossless: %gboolean, enable lossless encoding
  * * @compression: #VipsForeignHeifCompression, write with this compression
- * * @speed: %gint, CPU effort, 0 slowest - 8 fastest, AV1 compression only
+ * * @speed: %gint, encoding speed
  * * @subsample_mode: #VipsForeignSubsample, chroma subsampling mode
  *
  * Write a VIPS image to a file in HEIF format. 
@@ -81,7 +81,8 @@
  * if the target filename ends with ".avif", otherwise HEVC.
  *
  * Use @speed to control the CPU effort spent improving compression.
- * This is currently only applicable to AV1 encoders, defaults to 5.
+ * This is currently only applicable to AV1 encoders. Defaults to 5, 0 is
+ * slowest, 9 is fastest.
  *
  * Chroma subsampling is normally automatically disabled for Q >= 90. You can
  * force the subsampling mode with @subsample_mode.
@@ -115,7 +116,7 @@ vips_heifsave( VipsImage *in, const char *filename, ... )
  * * @Q: %gint, quality factor
  * * @lossless: %gboolean, enable lossless encoding
  * * @compression: #VipsForeignHeifCompression, write with this compression
- * * @speed: %gint, CPU effort, 0 slowest - 8 fastest, AV1 compression only
+ * * @speed: %gint, encoding speed
  * * @subsample_mode: #VipsForeignSubsample, chroma subsampling mode
  *
  * As vips_heifsave(), but save to a memory buffer. 
@@ -167,7 +168,7 @@ vips_heifsave_buffer( VipsImage *in, void **buf, size_t *len, ... )
  * * @Q: %gint, quality factor
  * * @lossless: %gboolean, enable lossless encoding
  * * @compression: #VipsForeignHeifCompression, write with this compression
- * * @speed: %gint, CPU effort, 0 slowest - 8 fastest, AV1 compression only
+ * * @speed: %gint, encoding speed
  * * @subsample_mode: #VipsForeignSubsample, chroma subsampling mode
  *
  * As vips_heifsave(), but save to a target.
diff --git a/libvips/foreign/jp2kload.c b/libvips/foreign/jp2kload.c
index f320aaba..0d5abfce 100644
--- a/libvips/foreign/jp2kload.c
+++ b/libvips/foreign/jp2kload.c
@@ -274,10 +274,6 @@ vips_foreign_load_jp2k_error_callback( const char *msg, void *client )
 
 	vips_error( class->nickname, "%s", msg ); 
 	jp2k->n_errors += 1;
-
-#ifdef DEBUG
-	printf( "%s: error %s",  class->nickname, msg );
-#endif /*DEBUG*/
 }
 
 /* The openjpeg info and warning callbacks are incredibly chatty.
@@ -500,12 +496,7 @@ vips_foreign_load_jp2k_header( VipsForeignLoad *load )
 	if( !opj_setup_decoder( jp2k->codec, &jp2k->parameters ) ) 
 		return( -1 );
 
-#ifdef HAVE_LIBOPENJP2_THREADING
-	/* Use eg. VIPS_CONCURRENCY etc. to set n-cpus, if this openjpeg has
-	 * stable support. 
-	 */
 	opj_codec_set_threads( jp2k->codec, vips_concurrency_get() );
-#endif /*HAVE_LIBOPENJP2_THREADING*/
 
 	if( !opj_read_header( jp2k->stream, jp2k->codec, &jp2k->image ) )
 		return( -1 );
@@ -586,7 +577,7 @@ vips_foreign_load_jp2k_header( VipsForeignLoad *load )
 	\
 	for( x = 0; x < length; x++ ) { \
 		for( i = 0; i < b; i++ ) { \
-			int dx = jp2k->image->comps[i].dx; \
+			int dx = image->comps[i].dx; \
 			int pixel = planes[i][x / dx]; \
 			\
 			tq[i] = pixel; \
@@ -596,28 +587,37 @@ vips_foreign_load_jp2k_header( VipsForeignLoad *load )
 	} \
 }
 
-/* Pack the set of openjpeg components into a libvips region. left/top are the
- * offsets into the tile in pixel coordinates where we should start reading.
+/* Pack a line of openjpeg pixels into libvips format. left/top are the
+ * offsets into the opj image in pixel coordinates where we should start 
+ * reading.
+ *
+ * Set upsample if any opj component is subsampled.
  */
 static void
-vips_foreign_load_jp2k_pack( VipsForeignLoadJp2k *jp2k, 
-	VipsImage *image, VipsPel *q, 
-	int left, int top, int length )
+vips_foreign_load_jp2k_pack( gboolean upsample, 
+	opj_image_t *image, VipsImage *im, 
+	VipsPel *q, int left, int top, int length )
 {
 	int *planes[MAX_BANDS];
-	int b = jp2k->image->numcomps;
+	int b = image->numcomps;
 
 	int x, i;
 
+#ifdef DEBUG_VERBOSE
+	printf( "vips_foreign_load_jp2k_pack: "
+		"upsample = %d, left = %d, top = %d, length = %d\n", 
+		upsample, left, top, length ); 
+#endif /*DEBUG_VERBOSE*/
+
 	for( i = 0; i < b; i++ ) {
-		opj_image_comp_t *comp = &jp2k->image->comps[i];
+		opj_image_comp_t *comp = &image->comps[i];
 
 		planes[i] = comp->data + (top / comp->dy) * comp->w + 
 			(left / comp->dx);
 	}
 
-	if( jp2k->upsample ) 
-		switch( image->BandFmt ) {
+	if( upsample ) 
+		switch( im->BandFmt ) {
 		case VIPS_FORMAT_CHAR:
 		case VIPS_FORMAT_UCHAR:
 			PACK_UPSAMPLE( unsigned char );
@@ -640,7 +640,7 @@ vips_foreign_load_jp2k_pack( VipsForeignLoadJp2k *jp2k,
 	else 
 		/* Fast no-upsample path.
 		 */
-		switch( image->BandFmt ) {
+		switch( im->BandFmt ) {
 		case VIPS_FORMAT_CHAR:
 		case VIPS_FORMAT_UCHAR:
 			PACK( unsigned char );
@@ -692,17 +692,16 @@ vips_foreign_load_jp2k_pack( VipsForeignLoadJp2k *jp2k,
 /* YCC->RGB for a line of pels.
  */
 static void
-vips_foreign_load_jp2k_ycc_to_rgb( VipsForeignLoadJp2k *jp2k, 
+vips_foreign_load_jp2k_ycc_to_rgb( opj_image_t *image, VipsImage *im, 
 	VipsPel *q, int length )
 {
-	VipsForeignLoad *load = (VipsForeignLoad *) jp2k;
-	int prec = jp2k->image->comps[0].prec;
+	int prec = image->comps[0].prec;
 	int offset = 1 << (prec - 1);
 	int upb = (1 << prec) - 1;
 
 	int x;
 
-	switch( load->out->BandFmt ) {
+	switch( im->BandFmt ) {
 	case VIPS_FORMAT_CHAR:
 	case VIPS_FORMAT_UCHAR:
 		YCC_TO_RGB( unsigned char );
@@ -806,15 +805,16 @@ vips_foreign_load_jp2k_generate( VipsRegion *out,
 				VipsPel *q = VIPS_REGION_ADDR( out, 
 					hit.left, hit.top + z );
 
-				vips_foreign_load_jp2k_pack( jp2k,
-					out->im, q,
+				vips_foreign_load_jp2k_pack( jp2k->upsample, 
+					jp2k->image, out->im, q,
 					hit.left - tile.left,
 					hit.top - tile.top + z,
 					hit.width ); 
 
 				if( jp2k->ycc_to_rgb )
-					vips_foreign_load_jp2k_ycc_to_rgb( jp2k,
-						q, hit.width );
+					vips_foreign_load_jp2k_ycc_to_rgb( 
+						jp2k->image, out->im, q, 
+						hit.width );
 			}
 
 			x += hit.width;
@@ -1132,6 +1132,142 @@ vips_foreign_load_jp2k_source_init(
 {
 }
 
+static void 
+warning_callback( const char *msg G_GNUC_UNUSED, void *data G_GNUC_UNUSED ) 
+{
+	/* Deliberately empty: there are a lot of warnings, so discard them.
+	 */
+}
+
+static void 
+error_callback( const char *msg, void *data G_GNUC_UNUSED ) 
+{
+	/* No printf(): libraries should not write to stdout. */
+	vips_error( "OpenJPEG", "%s", msg ); 
+}
+
+typedef struct _TileDecompress {
+	VipsSource *source;	/* memory source over the compressed bytes */
+        opj_stream_t *stream;	/* openjpeg stream made from source */
+        opj_codec_t *codec;	/* J2K decompressor */
+	opj_image_t *image;	/* set by opj_read_header() */
+} TileDecompress;
+
+static void
+vips__foreign_load_jp2k_decompress_free( TileDecompress *decompress )
+{
+	VIPS_FREEF( opj_destroy_codec, decompress->codec );
+	VIPS_FREEF( opj_image_destroy, decompress->image );
+	VIPS_FREEF( opj_stream_destroy, decompress->stream );
+	VIPS_UNREF( decompress->source );
+}
+
+/* Called from tiff2vips to decode a jp2k-compressed tile into @to. Returns 0
+ * on success, or -1 after calling vips_error().
+ * width/height is the tile size. If the decoded tile is smaller than this, 
+ * only the decoded area is written and our caller will clip.
+ */
+int
+vips__foreign_load_jp2k_decompress( VipsImage *out, 
+	int width, int height, gboolean ycc_to_rgb, 
+	void *from, size_t from_length, 
+	void *to, size_t to_length )
+{
+	size_t pel_size = VIPS_IMAGE_SIZEOF_PEL( out );
+	size_t line_size = pel_size * width;
+
+	TileDecompress decompress = { 0 };
+	opj_dparameters_t parameters;
+	const char *problem = NULL;
+	gboolean upsample;
+	VipsPel *q;
+	int i, y;
+
+#ifdef DEBUG
+	printf( "vips__foreign_load_jp2k_decompress: width = %d, height = %d, "
+		"ycc_to_rgb = %d, from_length = %zu, to_length = %zu\n",
+		width, height, ycc_to_rgb, from_length, to_length );
+#endif /*DEBUG*/
+
+	/* Our ycc->rgb only works for exactly 3 bands.
+	 */
+	ycc_to_rgb = ycc_to_rgb && out->Bands == 3;
+
+	decompress.codec = opj_create_decompress( OPJ_CODEC_J2K );
+	opj_set_default_decoder_parameters( &parameters );
+	opj_set_warning_handler( decompress.codec, warning_callback, NULL );
+	opj_set_error_handler( decompress.codec, error_callback, NULL );
+	decompress.source = vips_source_new_from_memory( from, from_length );
+	if( decompress.source )
+		decompress.stream = 
+			vips_foreign_load_jp2k_stream( decompress.source );
+
+	/* Stop at the first failure. Too many components would overrun @to.
+	 */
+	if( !decompress.codec || !decompress.stream ||
+		!opj_setup_decoder( decompress.codec, &parameters ) ) 
+		problem = _( "unable to set up decoder" );
+	else if( !opj_read_header( decompress.stream, 
+		decompress.codec, &decompress.image ) ) 
+		problem = _( "header error" );
+	else if( decompress.image->x1 > width || 
+		decompress.image->y1 > height ||
+		decompress.image->numcomps > MAX_BANDS ||
+		decompress.image->numcomps > (OPJ_UINT32) out->Bands ||
+		line_size * height > to_length ) 
+		problem = _( "bad dimensions" );
+	else if( !opj_decode( decompress.codec, 
+		decompress.stream, decompress.image ) ) 
+		problem = _( "decode error" );
+	if( problem ) {
+		vips_error( "jp2kload", "%s", problem );
+		vips__foreign_load_jp2k_decompress_free( &decompress ); 
+		return( -1 );
+	}
+
+	/* Need upsampling? Clamp to the decoded area so pack stays in bounds.
+	 */
+	upsample = FALSE;
+	for( i = 0; i < decompress.image->numcomps; i++ ) {
+		opj_image_comp_t *this = &decompress.image->comps[i];
+
+		if( this->dx > 1 || this->dy > 1 )
+			upsample = TRUE;
+		if( this->w * this->dx < (OPJ_UINT32) width )
+			width = this->w * this->dx;
+		if( this->h * this->dy < (OPJ_UINT32) height )
+			height = this->h * this->dy;
+	}
+
+	/* Unpack hit pixels to buffer in vips layout. 
+	 */
+	q = to;
+	for( y = 0; y < height; y++ ) {
+		vips_foreign_load_jp2k_pack( upsample, 
+			decompress.image, out, q, 0, y, width ); 
+		if( ycc_to_rgb )
+			vips_foreign_load_jp2k_ycc_to_rgb( 
+				decompress.image, out, q, width );
+		q += line_size;
+	}
+
+	vips__foreign_load_jp2k_decompress_free( &decompress ); 
+	return( 0 );
+}
+
+#else /*!HAVE_LIBOPENJP2*/
+
+int
+vips__foreign_load_jp2k_decompress( VipsImage *out, 
+	int width, int height, gboolean ycc_to_rgb, 
+	void *from, size_t from_length, 
+	void *to, size_t to_length )
+{
+	vips_error( "jp2k", 
+		"%s", _( "libvips built without JPEG2000 support" ) );
+	return( -1 );
+}
+
 #endif /*HAVE_LIBOPENJP2*/
 
 /**
diff --git a/libvips/foreign/jp2ksave.c b/libvips/foreign/jp2ksave.c
index 866f0b1c..ac6b83c3 100644
--- a/libvips/foreign/jp2ksave.c
+++ b/libvips/foreign/jp2ksave.c
@@ -42,6 +42,8 @@
  *
  * - could support png-like bitdepth parameter
  *
+ * - could support cp_comment field? not very useful
+ *
  */
 
 #ifdef HAVE_CONFIG_H
@@ -93,7 +95,6 @@ typedef struct _VipsForeignSaveJp2k {
 	opj_stream_t *stream;
 	opj_codec_t *codec;
 	opj_cparameters_t parameters;
-	opj_image_cmptparm_t comps[MAX_BANDS];
 	opj_image_t *image;
 
 	/* The line of tiles we are building, and the buffer we
@@ -102,9 +103,9 @@ typedef struct _VipsForeignSaveJp2k {
 	VipsRegion *strip;
 	VipsPel *tile_buffer;
 
-	/* If we need to downsample during unpacking.
+	/* If we need to subsample during unpacking.
 	 */
-	gboolean downsample;
+	gboolean subsample;
 
 	/* If we converto RGB to YCC during save.
 	 */
@@ -172,9 +173,7 @@ vips_foreign_save_jp2k_target( VipsTarget *target )
 static void 
 vips_foreign_save_jp2k_error_callback( const char *msg, void *client )
 {
-	VipsObjectClass *class = VIPS_OBJECT_GET_CLASS( client );
-
-	vips_error( class->nickname, "%s", msg ); 
+	vips_error( "jp2ksave", "%s", msg ); 
 }
 
 /* The openjpeg info and warning callbacks are incredibly chatty.
@@ -183,9 +182,7 @@ static void
 vips_foreign_save_jp2k_warning_callback( const char *msg, void *client )
 {
 #ifdef DEBUG
-	VipsObjectClass *class = VIPS_OBJECT_GET_CLASS( client );
-
-	g_warning( "%s: %s",  class->nickname, msg );
+	g_warning( "jp2ksave: %s", msg );
 #endif /*DEBUG*/
 }
 
@@ -195,22 +192,19 @@ static void
 vips_foreign_save_jp2k_info_callback( const char *msg, void *client )
 {
 #ifdef DEBUG
-	VipsObjectClass *class = VIPS_OBJECT_GET_CLASS( client );
-
-	g_info( "%s: %s",  class->nickname, msg );
+	g_info( "jp2ksave: %s", msg );
 #endif /*DEBUG*/
 }
 
 static void
-vips_foreign_save_jp2k_attach_handlers( VipsForeignSaveJp2k *jp2k,
-	opj_codec_t *codec )
+vips_foreign_save_jp2k_attach_handlers( opj_codec_t *codec )
 {
-	opj_set_info_handler( codec, 
-		vips_foreign_save_jp2k_info_callback, jp2k );
+	opj_set_info_handler( codec,
+		vips_foreign_save_jp2k_info_callback, NULL );
 	opj_set_warning_handler( codec, 
-		vips_foreign_save_jp2k_warning_callback, jp2k );
+		vips_foreign_save_jp2k_warning_callback, NULL );
 	opj_set_error_handler( codec, 
-		vips_foreign_save_jp2k_error_callback, jp2k );
+		vips_foreign_save_jp2k_error_callback, NULL );
 }
 
 /* See also https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion
@@ -238,23 +232,25 @@ vips_foreign_save_jp2k_attach_handlers( VipsForeignSaveJp2k *jp2k,
 	} \
 }
 
-/* RGB->YCC for a line of pels.
+/* In-place RGB->YCC for a line of pels.
  */
 static void
-vips_foreign_save_jp2k_rgb_to_ycc( VipsForeignSaveJp2k *jp2k, VipsRect *tile )
+vips_foreign_save_jp2k_rgb_to_ycc( VipsRegion *region, 
+	VipsRect *tile, int prec ) 
 {
-	VipsForeignSave *save = (VipsForeignSave *) jp2k;
-	int prec = jp2k->image->comps[0].prec;
+	VipsImage *im = region->im;
 	int offset = 1 << (prec - 1);
 	int upb = (1 << prec) - 1;
 
 	int x, y;
 
+	g_assert( im->Bands == 3 );
+
 	for( y = 0; y < tile->height; y++ ) {
-		VipsPel *q = VIPS_REGION_ADDR( jp2k->strip, 
+		VipsPel *q = VIPS_REGION_ADDR( region, 
 			tile->left, tile->top + y );
 
-		switch( save->ready->BandFmt ) {
+		switch( im->BandFmt ) {
 		case VIPS_FORMAT_CHAR:
 		case VIPS_FORMAT_UCHAR:
 			RGB_TO_YCC( unsigned char );
@@ -282,9 +278,9 @@ vips_foreign_save_jp2k_rgb_to_ycc( VipsForeignSaveJp2k *jp2k, VipsRect *tile )
  *   2. add subsequent lines in comp.dy.
  *   3. horizontal average to output line
  */
-#define SHRINK( ACC_TYPE, PIXEL_TYPE ) { \
-	ACC_TYPE *acc = (ACC_TYPE *) jp2k->accumulate; \
-	PIXEL_TYPE *tq = (PIXEL_TYPE *) q; \
+#define SHRINK( OUTPUT_TYPE, ACC_TYPE, PIXEL_TYPE ) { \
+	ACC_TYPE *acc = (ACC_TYPE *) accumulate; \
+	OUTPUT_TYPE *tq = (OUTPUT_TYPE *) q; \
 	const int n_pels = comp->dx * comp->dy; \
 	\
 	PIXEL_TYPE *tp; \
@@ -318,23 +314,23 @@ vips_foreign_save_jp2k_rgb_to_ycc( VipsForeignSaveJp2k *jp2k, VipsRect *tile )
 }
 
 static void
-vips_foreign_save_jp2k_unpack_downsample( VipsForeignSaveJp2k *jp2k, 
-	VipsRect *tile )
+vips_foreign_save_jp2k_unpack_subsample( VipsRegion *region, VipsRect *tile,
+	opj_image_t *image, VipsPel *tile_buffer, VipsPel *accumulate )
 {
-	VipsForeignSave *save = (VipsForeignSave *) jp2k;
-	size_t sizeof_element = VIPS_IMAGE_SIZEOF_ELEMENT( save->ready );
-	size_t lskip = VIPS_REGION_LSKIP( jp2k->strip );
-	int n_bands = save->ready->Bands;
+	VipsImage *im = region->im;
+	size_t sizeof_element = VIPS_REGION_SIZEOF_ELEMENT( region );
+	size_t lskip = VIPS_REGION_LSKIP( region );
+	int n_bands = im->Bands;
 
 	VipsPel *q;
-	int x, y, z, band_index;
+	int x, y, z, i;
 
-	q = jp2k->tile_buffer;
-	for( band_index = 0; band_index < n_bands; band_index++ ) {
-		opj_image_comp_t *comp = &jp2k->image->comps[band_index];
+	q = tile_buffer;
+	for( i = 0; i < n_bands; i++ ) {
+		opj_image_comp_t *comp = &image->comps[i];
 
-		/* The number of pixels we write for this component. Round to
-		 * nearest, and we may have to write half-pixels at the edges.
+		/* The number of pixels we write for this component. No
+		 * padding.
 		 */
 		int output_width = VIPS_ROUND_UINT( 
 			(double) tile->width / comp->dx );
@@ -342,35 +338,35 @@ vips_foreign_save_jp2k_unpack_downsample( VipsForeignSaveJp2k *jp2k,
 			(double) tile->height / comp->dy );;
 
 		for( y = 0; y < output_height; y++ ) {
-			VipsPel *p = band_index * sizeof_element + 
-				VIPS_REGION_ADDR( jp2k->strip, 
+			VipsPel *p = i * sizeof_element + 
+				VIPS_REGION_ADDR( region, 
 					tile->left, tile->top + y * comp->dy );
 
 			/* Shrink a line of pels to q.
 			 */
-			switch( save->ready->BandFmt ) {
+			switch( im->BandFmt ) {
 			case VIPS_FORMAT_CHAR:
-				SHRINK( int, signed char );
+				SHRINK( signed char, int, signed char );
 				break;
 
 			case VIPS_FORMAT_UCHAR:
-				SHRINK( int, unsigned char );
+				SHRINK( unsigned char, int, unsigned char );
 				break;
 
 			case VIPS_FORMAT_SHORT:
-				SHRINK( int, signed short );
+				SHRINK( signed short, int, signed short );
 				break;
 
 			case VIPS_FORMAT_USHORT:
-				SHRINK( int, unsigned short );
+				SHRINK( unsigned short, int, unsigned short );
 				break;
 
 			case VIPS_FORMAT_INT:
-				SHRINK( gint64, signed int );
+				SHRINK( signed int, gint64, signed int );
 				break;
 
 			case VIPS_FORMAT_UINT:
-				SHRINK( gint64, unsigned int );
+				SHRINK( unsigned int, gint64, unsigned int );
 				break;
 
 			default:
@@ -383,60 +379,56 @@ vips_foreign_save_jp2k_unpack_downsample( VipsForeignSaveJp2k *jp2k,
 	}
 }
 
-#define UNPACK( TYPE ) { \
-	TYPE **tplanes = (TYPE **) planes; \
-	TYPE *tp = (TYPE *) p; \
+#define UNPACK( OUT, IN ) { \
+	OUT *tq = (OUT *) q; \
+	IN *tp = (IN *) p + i; \
 	\
-	for( i = 0; i < b; i++ ) { \
-		TYPE *q = tplanes[i]; \
-		TYPE *tp1 = tp + i; \
-		\
-		for( x = 0; x < tile->width; x++ ) { \
-			q[x] = *tp1; \
-			tp1 += b; \
-		} \
-		\
-		tplanes[i] += tile->width; \
+	for( x = 0; x < tile->width; x++ ) { \
+		tq[x] = *tp; \
+		tp += b; \
 	} \
 }
 
 static void
-vips_foreign_save_jp2k_unpack( VipsForeignSaveJp2k *jp2k, VipsRect *tile )
+vips_foreign_save_jp2k_unpack( VipsRegion *region, VipsRect *tile,
+	opj_image_t *image, VipsPel *tile_buffer )
 {
-	VipsForeignSave *save = (VipsForeignSave *) jp2k;
-	size_t sizeof_element = VIPS_IMAGE_SIZEOF_ELEMENT( save->ready );
-	int b = save->ready->Bands;
+	VipsImage *im = region->im;
+	size_t sizeof_element = VIPS_REGION_SIZEOF_ELEMENT( region );
+	size_t sizeof_line = sizeof_element * tile->width;
+	size_t sizeof_tile = sizeof_line * tile->height;
+	int b = im->Bands;
 
-	VipsPel *planes[MAX_BANDS];
 	int x, y, i;
 
-	for( i = 0; i < b; i++ )
-		planes[i] = jp2k->tile_buffer +
-			i * sizeof_element * tile->width * tile->height;
-
 	for( y = 0; y < tile->height; y++ ) {
-		VipsPel *p = VIPS_REGION_ADDR( jp2k->strip, 
+		VipsPel *p = VIPS_REGION_ADDR( region, 
 			tile->left, tile->top + y );
 
-		switch( save->ready->BandFmt ) {
-		case VIPS_FORMAT_CHAR:
-		case VIPS_FORMAT_UCHAR:
-			UNPACK( unsigned char );
-			break;
+		for( i = 0; i < b; i++ ) {
+			VipsPel *q = tile_buffer + 
+				i * sizeof_tile + y * sizeof_line;
 
-		case VIPS_FORMAT_SHORT:
-		case VIPS_FORMAT_USHORT:
-			UNPACK( unsigned short );
-			break;
+			switch( im->BandFmt ) {
+			case VIPS_FORMAT_CHAR:
+			case VIPS_FORMAT_UCHAR:
+				UNPACK( unsigned char, unsigned char );
+				break;
 
-		case VIPS_FORMAT_INT:
-		case VIPS_FORMAT_UINT:
-			UNPACK( unsigned int );
-			break;
+			case VIPS_FORMAT_SHORT:
+			case VIPS_FORMAT_USHORT:
+				UNPACK( unsigned short, unsigned short );
+				break;
 
-		default:
-			g_assert_not_reached();
-			break;
+			case VIPS_FORMAT_INT:
+			case VIPS_FORMAT_UINT:
+				UNPACK( unsigned int, unsigned int );
+				break;
+
+			default:
+				g_assert_not_reached();
+				break;
+			}
 		}
 	}
 }
@@ -472,13 +464,13 @@ static int
 vips_foreign_save_jp2k_write_tiles( VipsForeignSaveJp2k *jp2k )
 {
 	VipsForeignSave *save = (VipsForeignSave *) jp2k;
-	int tiles_across = 
-		VIPS_ROUND_UP( save->ready->Xsize, jp2k->tile_width ) /
-			jp2k->tile_width;
+	VipsImage *im = save->ready;
+	int tiles_across = VIPS_ROUND_UP( im->Xsize, jp2k->tile_width ) /
+		jp2k->tile_width;
 
 	int x;
 
-	for( x = 0; x < save->ready->Xsize; x += jp2k->tile_width ) {
+	for( x = 0; x < im->Xsize; x += jp2k->tile_width ) {
 		VipsRect tile;
 		size_t sizeof_tile;
 		int tile_index;
@@ -490,14 +482,20 @@ vips_foreign_save_jp2k_write_tiles( VipsForeignSaveJp2k *jp2k )
 		vips_rect_intersectrect( &tile, &jp2k->strip->valid, &tile );
 
 		if( jp2k->save_as_ycc ) 
-			vips_foreign_save_jp2k_rgb_to_ycc( jp2k, &tile );
+			vips_foreign_save_jp2k_rgb_to_ycc( jp2k->strip, 
+				&tile, jp2k->image->comps[0].prec ); 
 
-		if( jp2k->downsample )
-			vips_foreign_save_jp2k_unpack_downsample( jp2k, &tile );
+		if( jp2k->subsample )
+			vips_foreign_save_jp2k_unpack_subsample( jp2k->strip, 
+				&tile, jp2k->image, 
+				jp2k->tile_buffer, jp2k->accumulate ); 
 		else
-			vips_foreign_save_jp2k_unpack( jp2k, &tile );
+			vips_foreign_save_jp2k_unpack( jp2k->strip, 
+				&tile, jp2k->image, 
+				jp2k->tile_buffer );
 
-		sizeof_tile = vips_foreign_save_jp2k_sizeof_tile( jp2k, &tile );
+		sizeof_tile = 
+			vips_foreign_save_jp2k_sizeof_tile( jp2k, &tile );
 		tile_index = tiles_across * tile.top / jp2k->tile_height +
 			x / jp2k->tile_width;
 		if( !opj_write_tile( jp2k->codec, tile_index, 
@@ -550,7 +548,7 @@ vips_foreign_save_jp2k_write_block( VipsRegion *region, VipsRect *area,
 			break;
 
 		/* We have reached the bottom of the strip. Write this line of
-		 * pixels and ove the strip down.
+		 * pixels and move the strip down.
 		 */
 		if( vips_foreign_save_jp2k_write_tiles( jp2k ) )
 			return( -1 );
@@ -566,67 +564,90 @@ vips_foreign_save_jp2k_write_block( VipsRegion *region, VipsRect *area,
 	return( 0 );
 }
 
-static int
-vips_foreign_save_jp2k_build( VipsObject *object )
-{
-	VipsObjectClass *class = VIPS_OBJECT_GET_CLASS( object );
-	VipsForeignSave *save = (VipsForeignSave *) object;
-	VipsForeignSaveJp2k *jp2k = (VipsForeignSaveJp2k *) object;
+/* We can't call opj_calloc on win, sadly.
+ */
+#define VIPS_OPJ_CALLOC( N, TYPE ) \
+	((TYPE *) calloc( (N), sizeof( TYPE ) ))
 
+/* Allocate an openjpeg image structure. Openjpeg has opj_image_create(), but
+ * that always allocates memory for each channel, and we don't want that when
+ * we are doing tiled write.
+ */
+static opj_image_t * 
+vips_opj_image_create( OPJ_UINT32 numcmpts,
+	opj_image_cmptparm_t *cmptparms, OPJ_COLOR_SPACE clrspc,
+	gboolean allocate )
+{
+	OPJ_UINT32 compno;
+	opj_image_t *image = NULL;
+
+	if( !(image = VIPS_OPJ_CALLOC( 1, opj_image_t )) )
+		return( NULL );
+
+        image->color_space = clrspc;
+        image->numcomps = numcmpts;
+        image->comps = VIPS_OPJ_CALLOC( image->numcomps, opj_image_comp_t );
+        if( !image->comps ) {
+            opj_image_destroy( image );
+            return( NULL );
+        }
+
+        for( compno = 0; compno < numcmpts; compno++ ) {
+		opj_image_comp_t *comp = &image->comps[compno];
+
+		comp->dx = cmptparms[compno].dx;
+		comp->dy = cmptparms[compno].dy;
+		comp->w = cmptparms[compno].w;
+		comp->h = cmptparms[compno].h;
+		comp->x0 = cmptparms[compno].x0;
+		comp->y0 = cmptparms[compno].y0;
+		comp->prec = cmptparms[compno].prec;
+		comp->bpp = cmptparms[compno].bpp;
+		comp->sgnd = cmptparms[compno].sgnd;
+
+		if( comp->h != 0 &&
+			(OPJ_SIZE_T) comp->w > SIZE_MAX / comp->h / 
+				sizeof( OPJ_INT32 ) ) {
+			opj_image_destroy( image );
+			return( NULL );
+		}
+
+		/* Allocation is optional.
+		 */
+		if( allocate ) {
+			size_t bytes = (size_t) comp->w * comp->h * 
+                                sizeof( OPJ_INT32 );
+
+			comp->data = (OPJ_INT32*) opj_image_data_alloc( bytes );
+			if( !comp->data ) {
+				opj_image_destroy( image );
+				return NULL;
+			}
+			memset( comp->data, 0, bytes );
+		}
+	}
+
+	return( image );
+}
+
+static opj_image_t *
+vips_foreign_save_jp2k_new_image( VipsImage *im, 
+	int width, int height, 
+	gboolean subsample, gboolean save_as_ycc, gboolean allocate )
+{
 	OPJ_COLOR_SPACE color_space;
 	int expected_bands;
 	int bits_per_pixel;
+	opj_image_cmptparm_t comps[MAX_BANDS];
+	opj_image_t *image;
 	int i;
-	size_t sizeof_tile;
-	size_t sizeof_line;
-	VipsRect strip_position;
 
-	if( VIPS_OBJECT_CLASS( vips_foreign_save_jp2k_parent_class )->
-		build( object ) )
-		return( -1 );
-
-	opj_set_default_encoder_parameters( &jp2k->parameters );
-
-	/* Analyze our arguments.
-	 */
-
-	if( !vips_band_format_isint( save->ready->BandFmt ) ) {
-		vips_error( class->nickname,
-			"%s", _( "not an integer format" ) );
-		return( -1 );
-	}
-
-	switch( jp2k->subsample_mode ) {
-	case VIPS_FOREIGN_SUBSAMPLE_AUTO:
-		jp2k->downsample =
-			!jp2k->lossless &&
-			jp2k->Q < 90 &&
-			save->ready->Xsize % 2 == 0 &&
-			save->ready->Ysize % 2 == 0 &&
-			(save->ready->Type == VIPS_INTERPRETATION_sRGB ||
-			 save->ready->Type == VIPS_INTERPRETATION_RGB16) &&
-			save->ready->Bands == 3;
-		break;
-
-	case VIPS_FOREIGN_SUBSAMPLE_ON:
-		jp2k->downsample = TRUE;
-		break;
-
-	case VIPS_FOREIGN_SUBSAMPLE_OFF:
-		jp2k->downsample = FALSE;
-		break;
-
-	default:
-		g_assert_not_reached();
-		break;
-	}
-
-	if( jp2k->downsample ) 
-		jp2k->save_as_ycc = TRUE;
+	if( im->Bands > MAX_BANDS )
+		return( NULL );
 
 	/* CIELAB etc. do not seem to be well documented.
 	 */
-	switch( save->ready->Type ) {
+	switch( im->Type ) {
 	case VIPS_INTERPRETATION_B_W:
 	case VIPS_INTERPRETATION_GREY16:
 		color_space = OPJ_CLRSPC_GRAY;
@@ -635,8 +656,7 @@ vips_foreign_save_jp2k_build( VipsObject *object )
 
 	case VIPS_INTERPRETATION_sRGB:
 	case VIPS_INTERPRETATION_RGB16:
-		color_space = jp2k->save_as_ycc ? 
-			OPJ_CLRSPC_SYCC : OPJ_CLRSPC_SRGB;
+		color_space = save_as_ycc ? OPJ_CLRSPC_SYCC : OPJ_CLRSPC_SRGB;
 		expected_bands = 3;
 		break;
 
@@ -647,11 +667,11 @@ vips_foreign_save_jp2k_build( VipsObject *object )
 
 	default:
 		color_space = OPJ_CLRSPC_UNSPECIFIED;
-		expected_bands = save->ready->Bands;
+		expected_bands = im->Bands;
 		break;
 	}
 
-	switch( save->ready->BandFmt ) {
+	switch( im->BandFmt ) {
 	case VIPS_FORMAT_CHAR:
 	case VIPS_FORMAT_UCHAR:
 		bits_per_pixel = 8;
@@ -674,8 +694,135 @@ vips_foreign_save_jp2k_build( VipsObject *object )
 		break;
 	}
 
+	for( i = 0; i < im->Bands; i++ ) {
+		comps[i].dx = (subsample && i > 0) ? 2 : 1;
+		comps[i].dy = (subsample && i > 0) ? 2 : 1;
+		comps[i].w = width;
+		comps[i].h = height;
+		comps[i].x0 = 0;
+		comps[i].y0 = 0;
+		comps[i].prec = bits_per_pixel;
+		comps[i].bpp = bits_per_pixel;
+		comps[i].sgnd = !vips_band_format_isuint( im->BandFmt );
+	}
+
+	/* NULL on allocation failure or oversized planes. */
+	if( !(image = vips_opj_image_create( im->Bands, 
+		comps, color_space, allocate )) )
+		return( NULL );
+	image->x1 = width;
+	image->y1 = height;
+	/* Tag alpha channels.
+	 */
+	for( i = 0; i < im->Bands; i++ )
+		image->comps[i].alpha = i >= expected_bands;
+	return( image );
+}
+
+/* Compression profile derived from the BM's recommendations, see:
+ *
+ * https://purl.pt/24107/1/iPres2013_PDF/An%20Analysis%20of%20Contemporary%20JPEG2000%20Codecs%20for%20Image%20Format%20Migration.pdf
+ *
+ * Some of these settings (eg. numresolution) are overridden later.
+ */
+static void
+vips_foreign_save_jp2k_set_profile( opj_cparameters_t *parameters, 
+	gboolean lossless, int Q )
+{
+	if( lossless )
+		parameters->irreversible = FALSE;
+	else {
+		int i;
+
+		/* Equivalent command-line flags:
+		 *
+		 *   -I -p RPCL -n 7 \
+		 *   	-c[256,256],[256,256],[256,256],[256,256],[256,256],[256,256],[256,256] \
+		 *   	-b 64,64
+		 */
+
+		parameters->irreversible = TRUE;
+		parameters->prog_order = OPJ_RPCL;
+		parameters->cblockw_init = 64;
+		parameters->cblockh_init = 64;
+		parameters->cp_disto_alloc = 1;
+		parameters->cp_fixed_quality = TRUE;
+		parameters->tcp_numlayers = 1;
+		parameters->numresolution = 7;
+
+		/* No idea what this does, but opj_compress sets it.
+		 */
+		parameters->csty = 1;
+
+		parameters->res_spec = 7;
+		for( i = 0; i < parameters->res_spec; i++ ) { 
+			parameters->prch_init[i] = 256;
+			parameters->prcw_init[i] = 256;
+			parameters->tcp_distoratio[i] = Q + 10 * i;
+		}
+	}
+}
+
+static int
+vips_foreign_save_jp2k_build( VipsObject *object )
+{
+	VipsObjectClass *class = VIPS_OBJECT_GET_CLASS( object );
+	VipsForeignSave *save = (VipsForeignSave *) object;
+	VipsForeignSaveJp2k *jp2k = (VipsForeignSaveJp2k *) object;
+
+	size_t sizeof_tile;
+	size_t sizeof_line;
+	VipsRect strip_position;
+
+	if( VIPS_OBJECT_CLASS( vips_foreign_save_jp2k_parent_class )->
+		build( object ) )
+		return( -1 );
+
+	/* Analyze our arguments.
+	 */
+
+	if( !vips_band_format_isint( save->ready->BandFmt ) ) {
+		vips_error( class->nickname,
+			"%s", _( "not an integer format" ) );
+		return( -1 );
+	}
+
+	switch( jp2k->subsample_mode ) {
+	case VIPS_FOREIGN_SUBSAMPLE_AUTO:
+		jp2k->subsample =
+			!jp2k->lossless &&
+			jp2k->Q < 90 &&
+			save->ready->Xsize % 2 == 0 &&
+			save->ready->Ysize % 2 == 0 &&
+			(save->ready->Type == VIPS_INTERPRETATION_sRGB ||
+			 save->ready->Type == VIPS_INTERPRETATION_RGB16) &&
+			save->ready->Bands == 3;
+		break;
+
+	case VIPS_FOREIGN_SUBSAMPLE_ON:
+		jp2k->subsample = TRUE;
+		break;
+
+	case VIPS_FOREIGN_SUBSAMPLE_OFF:
+		jp2k->subsample = FALSE;
+		break;
+
+	default:
+		g_assert_not_reached();
+		break;
+	}
+
+	if( jp2k->subsample ) 
+		jp2k->save_as_ycc = TRUE;
+
 	/* Set parameters for compressor.
 	 */ 
+	opj_set_default_encoder_parameters( &jp2k->parameters );
+
+	/* Set compression profile.
+	 */
+	vips_foreign_save_jp2k_set_profile( &jp2k->parameters, 
+		jp2k->lossless, jp2k->Q ); 
 
 	/* Always tile.
 	 */
@@ -683,6 +830,10 @@ vips_foreign_save_jp2k_build( VipsObject *object )
 	jp2k->parameters.cp_tdx = jp2k->tile_width;
 	jp2k->parameters.cp_tdy = jp2k->tile_height;
 
+	/* Use the multi-component transform: makes 3+ band images smaller.
+	 */
+	jp2k->parameters.tcp_mct = save->ready->Bands >= 3 ? 1 : 0;
+
 	/* Number of layers to write. Smallest layer is c. 2^5 on the smallest
 	 * axis.
 	 */
@@ -694,69 +845,27 @@ vips_foreign_save_jp2k_build( VipsObject *object )
 		jp2k->parameters.numresolution );
 #endif /*DEBUG*/
 
-	for( i = 0; i < save->ready->Bands; i++ ) {
-		jp2k->comps[i].dx = (jp2k->downsample && i > 0) ? 2 : 1;
-		jp2k->comps[i].dy = (jp2k->downsample && i > 0) ? 2 : 1;
-		jp2k->comps[i].w = save->ready->Xsize;
-		jp2k->comps[i].h = save->ready->Ysize;
-		jp2k->comps[i].x0 = 0;
-		jp2k->comps[i].y0 = 0;
-		jp2k->comps[i].prec = bits_per_pixel;
-		jp2k->comps[i].bpp = bits_per_pixel;
-		jp2k->comps[i].sgnd = 
-			!vips_band_format_isuint( save->ready->BandFmt );
-	}
-
-	/* Makes three band images smaller, somehow.
-	 */
-	jp2k->parameters.tcp_mct = 
-		(save->ready->Bands == 3 && !jp2k->downsample) ? 1 : 0;
-
-	/* Lossy mode.
-	 */
-	if( !jp2k->lossless ) {
-		jp2k->parameters.irreversible = TRUE;
-
-		/* Map Q to allowed distortion.
-		 */
-		jp2k->parameters.cp_disto_alloc = 1;
-		jp2k->parameters.cp_fixed_quality = TRUE;
-		jp2k->parameters.tcp_distoratio[0] = jp2k->Q;
-		jp2k->parameters.tcp_numlayers = 1;
-	}
-
-	/* Create output image.
-	 */
-
-	jp2k->image = opj_image_create( save->ready->Bands, 
-		jp2k->comps, color_space );
-	jp2k->image->x1 = save->ready->Xsize;
-	jp2k->image->y1 = save->ready->Ysize;
-
-	/* Tag alpha channels.
-	 */
-	for( i = 0; i < save->ready->Bands; i++ )
-		jp2k->image->comps[i].alpha = i >= expected_bands;
-
 	/* Set up compressor.
 	 */
 
 	jp2k->codec = opj_create_compress( OPJ_CODEC_J2K );
-	vips_foreign_save_jp2k_attach_handlers( jp2k, jp2k->codec );
+	vips_foreign_save_jp2k_attach_handlers( jp2k->codec );
+	/* FALSE means don't alloc memory for image planes (we write in 
+	 * tiles, not whole images).
+	 */
+	if( !(jp2k->image = vips_foreign_save_jp2k_new_image( save->ready,
+		save->ready->Xsize, save->ready->Ysize, 
+		jp2k->subsample, jp2k->save_as_ycc, FALSE )) )
+		return( -1 );
         if( !opj_setup_encoder( jp2k->codec, &jp2k->parameters, jp2k->image ) ) 
 		return( -1 );
 
-#ifdef HAVE_LIBOPENJP2_THREADING
-	/* Use eg. VIPS_CONCURRENCY etc. to set n-cpus, if this openjpeg has
-	 * stable support. 
-	 */
 	opj_codec_set_threads( jp2k->codec, vips_concurrency_get() );
-#endif /*HAVE_LIBOPENJP2_THREADING*/
 
 	if( !(jp2k->stream = vips_foreign_save_jp2k_target( jp2k->target )) )
 		return( -1 );
 
-	if( !opj_start_compress( jp2k->codec, jp2k->image,  jp2k->stream ) )
+	if( !opj_start_compress( jp2k->codec, jp2k->image, jp2k->stream ) )
 		return( -1 );
 
 	/* The buffer we repack tiles to for write. Large enough for one
@@ -813,7 +922,7 @@ vips_foreign_save_jp2k_class_init( VipsForeignSaveJp2kClass *class )
 	gobject_class->get_property = vips_object_get_property;
 
 	object_class->nickname = "jp2ksave_base";
-	object_class->description = _( "save image in HEIF format" );
+	object_class->description = _( "save image in JPEG2000 format" );
 	object_class->build = vips_foreign_save_jp2k_build;
 
 	foreign_class->suffs = vips__jp2k_suffs;
@@ -854,7 +963,7 @@ vips_foreign_save_jp2k_class_init( VipsForeignSaveJp2kClass *class )
 		_( "Q factor" ),
 		VIPS_ARGUMENT_OPTIONAL_INPUT,
 		G_STRUCT_OFFSET( VipsForeignSaveJp2k, Q ),
-		1, 100, 45 );
+		1, 100, 48 );
 
 }
 
@@ -864,9 +973,9 @@ vips_foreign_save_jp2k_init( VipsForeignSaveJp2k *jp2k )
 	jp2k->tile_width = 512;
 	jp2k->tile_height = 512;
 
-	/* 45 gives about the same filesize as default regular jpg.
+	/* Chosen to give about the same filesize as regular jpg Q75.
 	 */
-	jp2k->Q = 45;
+	jp2k->Q = 48;
 
 	jp2k->subsample_mode = VIPS_FOREIGN_SUBSAMPLE_AUTO;
 }
@@ -1048,6 +1157,248 @@ vips_foreign_save_jp2k_target_init( VipsForeignSaveJp2kTarget *target )
 {
 }
 
+/* State for compressing one tile; all members are freed together.
+ */
+typedef struct _TileCompress {
+        opj_codec_t *codec;	/* openjpeg encoder */
+	opj_image_t *image;	/* int planes the tile is unpacked into */
+	opj_stream_t *stream;	/* output stream made from the VipsTarget */
+	VipsPel *accumulate;	/* one line of sums for chroma subsample */
+} TileCompress;
+
+/* Unpack @tile within @region to the int data planes on @image, one plane per
+ * band, each written at that component's own dx/dy subsample rate.
+ */
+static void
+vips_foreign_save_jp2k_unpack_subsample_image( VipsRegion *region, 
+	VipsRect *tile, opj_image_t *image, VipsPel *accumulate )
+{
+	VipsImage *im = region->im;
+	size_t sizeof_element = VIPS_REGION_SIZEOF_ELEMENT( region );
+	size_t lskip = VIPS_REGION_LSKIP( region );
+	int n_bands = im->Bands;
+
+	int x, y, z, i;
+
+	for( i = 0; i < n_bands; i++ ) {
+		opj_image_comp_t *comp = &image->comps[i];
+		int *q = comp->data;
+
+		/* The size of the plane we write for this component: comp->w
+		 * by comp->h over the dx/dy factors. Lines in q are this wide.
+		 */
+		int output_width = VIPS_ROUND_UINT( 
+			(double) comp->w / comp->dx );
+		int output_height = VIPS_ROUND_UINT( 
+			(double) comp->h / comp->dy );
+
+		for( y = 0; y < output_height; y++ ) {
+			VipsPel *p = i * sizeof_element + 
+				VIPS_REGION_ADDR( region, 
+					tile->left, tile->top + y * comp->dy );
+
+			/* Shrink band i of the line at p to q; see SHRINK().
+			 */
+			switch( im->BandFmt ) {
+			case VIPS_FORMAT_CHAR:
+				SHRINK( int, int, signed char );
+				break;
+
+			case VIPS_FORMAT_UCHAR:
+				SHRINK( int, int, unsigned char );
+				break;
+
+			case VIPS_FORMAT_SHORT:
+				SHRINK( int, int, signed short );
+				break;
+
+			case VIPS_FORMAT_USHORT:
+				SHRINK( int, int, unsigned short );
+				break;
+
+			case VIPS_FORMAT_INT:
+				SHRINK( int, gint64, signed int );
+				break;
+
+			case VIPS_FORMAT_UINT:
+				SHRINK( int, gint64, unsigned int );
+				break;
+
+			default:
+				g_assert_not_reached();
+				break;
+			}
+
+			q += output_width;	/* next output scanline */
+		}
+	}
+}
+
+/* Unpack @tile within @region to the int data planes on @image, one plane per
+ * band, with no subsampling. Rows in each plane are comp->w ints apart.
+ */
+static void
+vips_foreign_save_jp2k_unpack_image( VipsRegion *region, VipsRect *tile,
+	opj_image_t *image )
+{
+	VipsImage *im = region->im;
+	int b = im->Bands;
+
+	int x, y, i;
+
+	for( y = 0; y < tile->height; y++ ) {
+		VipsPel *p = VIPS_REGION_ADDR( region, 
+			tile->left, tile->top + y );
+
+		for( i = 0; i < b; i++ ) {
+			opj_image_comp_t *comp = &image->comps[i];
+                        int *q = comp->data + y * comp->w; 
+
+			switch( im->BandFmt ) {
+			case VIPS_FORMAT_CHAR:	/* NOTE(review): unsigned path? */
+			case VIPS_FORMAT_UCHAR:
+				UNPACK( int, unsigned char );
+				break;
+
+			case VIPS_FORMAT_SHORT:	/* NOTE(review): as above */
+			case VIPS_FORMAT_USHORT:
+				UNPACK( int, unsigned short );
+				break;
+
+			case VIPS_FORMAT_INT:	/* NOTE(review): as above */
+			case VIPS_FORMAT_UINT:
+				UNPACK( int, unsigned int );
+				break;
+
+			default:
+				g_assert_not_reached();
+				break;
+			}
+		}
+	}
+}
+
+static void	/* file-private: free whichever members of @compress are set */
+vips__foreign_load_jp2k_compress_free( TileCompress *compress )
+{
+	VIPS_FREEF( opj_destroy_codec, compress->codec );
+	VIPS_FREEF( opj_image_destroy, compress->image );
+	VIPS_FREEF( opj_stream_destroy, compress->stream );
+	VIPS_FREE( compress->accumulate );
+}
+
+/* Compress area @tile within @region and write to @target as a @tile_width by
+ * @tile_height jp2k compressed image. This is called from eg. vips2tiff to
+ * write jp2k-compressed tiles. Returns 0 on success, or -1 if any step fails,
+ * including finishing the codestream with opj_end_compress().
+ *
+ * You'd think we could reuse things like the encoder between calls but ...
+ * nope, openjpeg does not allow that.
+ */
+int
+vips__foreign_load_jp2k_compress( VipsRegion *region, 
+	VipsRect *tile, VipsTarget *target,
+	int tile_width, int tile_height,
+        gboolean save_as_ycc, gboolean subsample, gboolean lossless, int Q )
+{
+	TileCompress compress = { 0 };	/* zeroed so early exits can free */
+	opj_cparameters_t parameters;
+	size_t sizeof_line;
+
+	/* Our rgb->ycc only works for exactly 3 bands.
+	 */
+	save_as_ycc = save_as_ycc && region->im->Bands == 3;
+	subsample = subsample && save_as_ycc;
+
+	/* Set compression params.
+	 */
+	opj_set_default_encoder_parameters( &parameters );
+
+	/* Set compression profile.
+	 */
+	vips_foreign_save_jp2k_set_profile( &parameters, lossless, Q ); 
+
+	/* Use the multi-component transform: makes 3+ band images smaller.
+	 */
+	parameters.tcp_mct = region->im->Bands >= 3 ? 1 : 0;
+
+	/* Create output image. TRUE means we alloc memory for the image
+	 * planes.
+	 */
+	if( !(compress.image = vips_foreign_save_jp2k_new_image( region->im,
+		tile_width, tile_height, subsample, save_as_ycc, TRUE )) ) {
+		vips__foreign_load_jp2k_compress_free( &compress );
+		return( -1 );
+	}
+
+	/* We need a line of sums for chroma subsample. At worst, gint64.
+	 */
+	sizeof_line = sizeof( gint64 ) * tile->width;
+	if( !(compress.accumulate = 
+		VIPS_ARRAY( NULL, sizeof_line, VipsPel )) ) {
+		vips__foreign_load_jp2k_compress_free( &compress );
+		return( -1 );
+	}
+
+	compress.codec = opj_create_compress( OPJ_CODEC_J2K );
+	vips_foreign_save_jp2k_attach_handlers( compress.codec );
+        if( !opj_setup_encoder( compress.codec, 
+		&parameters, compress.image ) ) {
+		vips__foreign_load_jp2k_compress_free( &compress );
+		return( -1 );
+	}
+
+	opj_codec_set_threads( compress.codec, vips_concurrency_get() );
+
+	if( save_as_ycc ) 
+		vips_foreign_save_jp2k_rgb_to_ycc( region, 
+			tile, compress.image->comps[0].prec );
+
+	/* We need to unpack to the int arrays on comps[i].data
+	 */
+	if( subsample )
+		vips_foreign_save_jp2k_unpack_subsample_image( region,
+			tile, compress.image, 
+			compress.accumulate ); 
+	else
+		vips_foreign_save_jp2k_unpack_image( region,
+			tile, compress.image ); 
+
+	if( !(compress.stream = vips_foreign_save_jp2k_target( target )) ) {
+		vips__foreign_load_jp2k_compress_free( &compress );
+		return( -1 );
+	}
+
+	if( !opj_start_compress( compress.codec, 
+		compress.image, compress.stream ) ) {
+		vips__foreign_load_jp2k_compress_free( &compress );
+		return( -1 );
+	}
+
+	if( !opj_encode( compress.codec, compress.stream ) ||
+		!opj_end_compress( compress.codec, compress.stream ) ) {
+		vips__foreign_load_jp2k_compress_free( &compress );
+		return( -1 );
+	}
+
+	vips__foreign_load_jp2k_compress_free( &compress );
+
+	return( 0 );
+}
+
+#else /*!HAVE_LIBOPENJP2*/
+
+int
+vips__foreign_load_jp2k_compress( VipsRegion *region, 
+	VipsRect *tile, VipsTarget *target,
+	int tile_width, int tile_height,
+        gboolean save_as_ycc, gboolean subsample, gboolean lossless, int Q )
+{
+	vips_error( "jp2k", 
+		"%s", _( "libvips built without JPEG2000 support" ) );
+	return( -1 );	/* stub: every argument is ignored, always fails */
+}
+
 #endif /*HAVE_LIBOPENJP2*/
 
 /**
@@ -1067,9 +1418,9 @@ vips_foreign_save_jp2k_target_init( VipsForeignSaveJp2kTarget *target )
  * Write a VIPS image to a file in JPEG2000 format. 
  * The saver supports 8, 16 and 32-bit int pixel
  * values, signed and unsigned. It supports greyscale, RGB, CMYK and
- * multispectral images. 
+ * multispectral images.
  *
- * Use @Q to set the compression quality factor. The default value of 45
+ * Use @Q to set the compression quality factor. The default value
  * produces file with approximately the same size as regular JPEG Q 75.
  *
  * Set @lossless to enable lossless compresion.
@@ -1113,7 +1464,7 @@ vips_jp2ksave( VipsImage *in, const char *filename, ... )
  * * @tile_height: %gint for tile size
  * * @subsample_mode: #VipsForeignSubsample, chroma subsampling mode
  *
- * As vips_jp2ksave(), but save to a memory buffer.
+ * As vips_jp2ksave(), but save to a target.
  *
  * See also: vips_jp2ksave(), vips_image_write_to_target().
  *
diff --git a/libvips/foreign/libnsgif/Makefile.am b/libvips/foreign/libnsgif/Makefile.am
index 26a31fb6..ef81a9c2 100644
--- a/libvips/foreign/libnsgif/Makefile.am
+++ b/libvips/foreign/libnsgif/Makefile.am
@@ -1,3 +1,15 @@
+noinst_LTLIBRARIES = libnsgif.la
+
+MY_SOURCES = \
+	libnsgif.h \
+	libnsgif.c \
+	lzw.c \
+	lzw.h
+
+if ENABLE_NSGIF
+libnsgif_la_SOURCES = $(MY_SOURCES)
+endif
+
 EXTRA_DIST = \
 	README-ns \
 	README.md \
@@ -5,11 +17,7 @@ EXTRA_DIST = \
 	update.sh \
 	utils 
 
-noinst_LTLIBRARIES = libnsgif.la
-
-libnsgif_la_SOURCES = \
-	libnsgif.h \
-	libnsgif.c \
-	lzw.c \
-	lzw.h
-
+if !ENABLE_NSGIF
+EXTRA_DIST += \
+	$(MY_SOURCES) 
+endif
diff --git a/libvips/foreign/pforeign.h b/libvips/foreign/pforeign.h
index fd3e3728..4c6b045c 100644
--- a/libvips/foreign/pforeign.h
+++ b/libvips/foreign/pforeign.h
@@ -64,7 +64,8 @@ int vips__tiff_write( VipsImage *in, const char *filename,
 	int level, 
 	gboolean lossless,
 	VipsForeignDzDepth depth,
-	gboolean subifd );
+	gboolean subifd, 
+	gboolean premultiply );
 
 int vips__tiff_write_buf( VipsImage *in, 
 	void **obuf, size_t *olen, 
@@ -83,7 +84,8 @@ int vips__tiff_write_buf( VipsImage *in,
 	int level, 
 	gboolean lossless,
 	VipsForeignDzDepth depth,
-	gboolean subifd );
+	gboolean subifd,
+	gboolean premultiply );
 
 gboolean vips__istiff_source( VipsSource *source );
 gboolean vips__istifftiled_source( VipsSource *source );
@@ -237,6 +239,14 @@ struct heif_image;
 void vips__heif_image_print( struct heif_image *img );
 
 extern const char *vips__jp2k_suffs[];
+int vips__foreign_load_jp2k_decompress( VipsImage *out,
+	int width, int height, gboolean ycc_to_rgb,
+	void *from, size_t from_length,
+	void *to, size_t to_length );
+int vips__foreign_load_jp2k_compress( VipsRegion *region,
+	VipsRect *tile, VipsTarget *target,
+	int tile_width, int tile_height,
+	gboolean save_as_ycc, gboolean subsample, gboolean lossless, int Q );
 
 extern const char *vips__jxl_suffs[];
 
diff --git a/libvips/foreign/tiff2vips.c b/libvips/foreign/tiff2vips.c
index be8a8fe0..122f9d0b 100644
--- a/libvips/foreign/tiff2vips.c
+++ b/libvips/foreign/tiff2vips.c
@@ -201,6 +201,8 @@
  * 	- add subifd
  * 6/6/20 MathemanFlo
  * 	- support 2 and 4 bit greyscale load
+ * 27/3/21
+ * 	- add jp2k decompression
  */
 
 /*
@@ -253,6 +255,23 @@
 #include "pforeign.h"
 #include "tiff.h"
 
+/* Aperio TIFFs (svs) use these compression types for jp2k-compressed tiles.
+ */
+#define JP2K_YCC 33003
+#define JP2K_RGB 33005
+
+/* Bioformats uses this tag for jp2k compressed tiles.
+ */
+#define JP2K_LOSSY 33004
+
+/* Compression types we decompress ourselves. Read-only lookup table.
+ */
+static const int rtiff_we_decompress[] = {
+	JP2K_YCC,
+	JP2K_RGB,
+	JP2K_LOSSY
+};
+
 /* What we read from the tiff dir to set our read strategy. For multipage
  * read, we need to read and compare lots of these, so it needs to be broken
  * out as a separate thing.
@@ -316,6 +335,11 @@ typedef struct _RtiffHeader {
 	 */
 	char *image_description;
 
+	/* TRUE if the compression type is not supported by libtiff directly
+	 * and we must read the raw data and decompress ourselves.
+	 */
+	gboolean we_decompress;
+
 } RtiffHeader;
 
 /* Scanline-type process function.
@@ -371,6 +395,12 @@ typedef struct _Rtiff {
 	 */
 	tdata_t contig_buf;
 
+	/* If we are decompressing, we need a buffer to read the raw tile to
+	 * before running the decompressor.
+	 */
+	tdata_t compressed_buf;
+	tsize_t compressed_buf_length;
+
 	/* The Y we are reading at. Used to verify strip read is sequential.
 	 */
 	int y_pos;
@@ -1720,12 +1750,47 @@ static int
 rtiff_read_tile( Rtiff *rtiff, tdata_t *buf, int x, int y )
 {
 #ifdef DEBUG_VERBOSE
-	printf( "rtiff_read_tile: x = %d, y = %d\n", x, y ); 
+	printf( "rtiff_read_tile: x = %d, y = %d, we_decompress = %d\n", 
+		x, y, rtiff->header.we_decompress ); 
 #endif /*DEBUG_VERBOSE*/
 
-	if( TIFFReadTile( rtiff->tiff, buf, x, y, 0, 0 ) < 0 ) { 
-		vips_foreign_load_invalidate( rtiff->out );
-		return( -1 ); 
+	if( rtiff->header.we_decompress ) {
+		ttile_t tile_no = TIFFComputeTile( rtiff->tiff, x, y, 0, 0 );
+
+		tsize_t size;
+
+		size = TIFFReadRawTile( rtiff->tiff, tile_no, 
+			rtiff->compressed_buf, rtiff->compressed_buf_length );
+		if( size <= 0 ) {
+			vips_foreign_load_invalidate( rtiff->out );
+			return( -1 ); 
+		}
+
+		switch( rtiff->header.compression ) {
+		case JP2K_YCC:
+		case JP2K_RGB:
+		case JP2K_LOSSY:
+			if( vips__foreign_load_jp2k_decompress( 
+				rtiff->out, 
+				rtiff->header.tile_width, 
+				rtiff->header.tile_height,
+				TRUE,
+				rtiff->compressed_buf, size,
+				buf, rtiff->header.tile_size ) ) 
+				return( -1 );
+			break;
+
+		default:
+			g_assert_not_reached();
+			break;
+		}
+
+	}
+	else {
+		if( TIFFReadTile( rtiff->tiff, buf, x, y, 0, 0 ) < 0 ) { 
+			vips_foreign_load_invalidate( rtiff->out );
+			return( -1 ); 
+		}
 	}
 
 	return( 0 ); 
@@ -1982,6 +2047,20 @@ rtiff_read_tilewise( Rtiff *rtiff, VipsImage *out )
 		return( -1 );
 	}
 
+	/* If we will be decompressing, we need a buffer large enough to hold
+	 * the largest compressed tile in any page.
+	 *
+	 * Allocate a buffer 2x the uncompressed tile size ... much simpler
+	 * than searching every page for the largest tile with
+	 * TIFFTAG_TILEBYTECOUNTS.
+	 */
+	if( rtiff->header.we_decompress ) {
+		rtiff->compressed_buf_length = 2 * rtiff->header.tile_size;
+		if( !(rtiff->compressed_buf = vips_malloc( VIPS_OBJECT( out ), 
+			rtiff->compressed_buf_length )) )
+			return( -1 );
+	}
+
 	/* Read to this image, then cache to out, see below.
 	 */
 	t[0] = vips_image_new(); 
@@ -2355,6 +2434,7 @@ rtiff_read_stripwise( Rtiff *rtiff, VipsImage *out )
 static int
 rtiff_header_read( Rtiff *rtiff, RtiffHeader *header )
 {
+	int i;
 	uint16 extra_samples_count;
 	uint16 *extra_samples_types;
 	toff_t *subifd_offsets;
@@ -2377,6 +2457,19 @@ rtiff_header_read( Rtiff *rtiff, RtiffHeader *header )
 	TIFFGetFieldDefaulted( rtiff->tiff, 
 		TIFFTAG_COMPRESSION, &header->compression );
 
+	/* One of the types we decompress?
+	 */
+	for( i = 0; i < VIPS_NUMBER( rtiff_we_decompress ); i++ )
+		if( header->compression == rtiff_we_decompress[i] ) {
+#ifdef DEBUG
+			printf( "rtiff_header_read: "
+				"compression %d handled by us\n", 
+				header->compression );
+#endif /*DEBUG*/
+			header->we_decompress = TRUE;
+			break;
+		}
+
 	/* We must set this here since it'll change the value of scanline_size.
 	 */
 	rtiff_set_decode_format( rtiff );
@@ -2471,6 +2564,8 @@ rtiff_header_read( Rtiff *rtiff, RtiffHeader *header )
 	 */
 	header->tiled = TIFFIsTiled( rtiff->tiff );
 
+
+
 #ifdef DEBUG
 	printf( "rtiff_header_read: header.width = %d\n", 
 		header->width );
@@ -2652,6 +2747,7 @@ rtiff_header_equal( RtiffHeader *h1, RtiffHeader *h2 )
 		h1->photometric_interpretation != 
 			h2->photometric_interpretation ||
 		h1->sample_format != h2->sample_format ||
+		h1->compression != h2->compression ||
 		h1->separate != h2->separate ||
 		h1->tiled != h2->tiled ||
 		h1->orientation != h2->orientation )
diff --git a/libvips/foreign/tiffsave.c b/libvips/foreign/tiffsave.c
index f9a4db2e..ef7a156a 100644
--- a/libvips/foreign/tiffsave.c
+++ b/libvips/foreign/tiffsave.c
@@ -26,6 +26,8 @@
  * 8/6/20
  * 	- add bitdepth support for 2 and 4 bit greyscale images
  * 	- deprecate "squash"
+ * 1/5/21
+ * 	- add "premultiply" flag
  */
 
 /*
@@ -105,6 +107,7 @@ typedef struct _VipsForeignSaveTiff {
 	gboolean lossless;
 	VipsForeignDzDepth depth;
 	gboolean subifd;
+	gboolean premultiply;
 
 } VipsForeignSaveTiff;
 
@@ -341,13 +344,20 @@ vips_foreign_save_tiff_class_init( VipsForeignSaveTiffClass *class )
 		G_STRUCT_OFFSET( VipsForeignSaveTiff, depth ),
 		VIPS_TYPE_FOREIGN_DZ_DEPTH, VIPS_FOREIGN_DZ_DEPTH_ONETILE ); 
 
-	VIPS_ARG_BOOL( class, "subifd", 24, 
+	VIPS_ARG_BOOL( class, "subifd", 26, 
 		_( "Sub-IFD" ), 
 		_( "Save pyr layers as sub-IFDs" ),
 		VIPS_ARGUMENT_OPTIONAL_INPUT,
 		G_STRUCT_OFFSET( VipsForeignSaveTiff, subifd ),
 		FALSE );
 
+	VIPS_ARG_BOOL( class, "premultiply", 27, 
+		_( "Premultiply" ), 
+		_( "Save with premultiplied alpha" ),
+		VIPS_ARGUMENT_OPTIONAL_INPUT,
+		G_STRUCT_OFFSET( VipsForeignSaveTiff, premultiply ),
+		FALSE );
+
 	VIPS_ARG_BOOL( class, "rgbjpeg", 20, 
 		_( "RGB JPEG" ),
 		_( "Output RGB JPEG rather than YCbCr" ),
@@ -427,7 +437,8 @@ vips_foreign_save_tiff_file_build( VipsObject *object )
 		tiff->level,
 		tiff->lossless,
 		tiff->depth,
-		tiff->subifd ) )
+		tiff->subifd,
+		tiff->premultiply ) )
 		return( -1 );
 
 	return( 0 );
@@ -500,7 +511,8 @@ vips_foreign_save_tiff_buffer_build( VipsObject *object )
 		tiff->level,
 		tiff->lossless, 
 		tiff->depth,
-		tiff->subifd ) )
+		tiff->subifd,
+		tiff->premultiply ) )
 		return( -1 );
 
 	blob = vips_blob_new( (VipsCallbackFn) vips_area_free_cb, obuf, olen );
@@ -567,6 +579,7 @@ vips_foreign_save_tiff_buffer_init( VipsForeignSaveTiffBuffer *buffer )
  * * @lossless: %gboolean, WebP losssless mode
  * * @depth: #VipsForeignDzDepth how deep to make the pyramid
  * * @subifd: %gboolean write pyr layers as sub-ifds
+ * * @premultiply: %gboolean write premultiplied alpha
  *
  * Write a VIPS image to a file as TIFF.
  *
@@ -658,6 +671,9 @@ vips_foreign_save_tiff_buffer_init( VipsForeignSaveTiffBuffer *buffer )
  * Set @subifd to save pyramid layers as sub-directories of the main image.
  * Setting this option can improve compatibility with formats like OME.
  *
+ * Set @premultiply to save with premultiplied alpha. Some programs, such as
+ * InDesign, will only work with premultiplied alpha.
+ *
  * See also: vips_tiffload(), vips_image_write_to_file().
  *
  * Returns: 0 on success, -1 on error.
@@ -704,6 +720,7 @@ vips_tiffsave( VipsImage *in, const char *filename, ... )
  * * @lossless: %gboolean, WebP losssless mode
  * * @depth: #VipsForeignDzDepth how deep to make the pyramid
  * * @subifd: %gboolean write pyr layers as sub-ifds
+ * * @premultiply: %gboolean write premultiplied alpha
  *
  * As vips_tiffsave(), but save to a memory buffer. 
  *
diff --git a/libvips/foreign/vips2heif.c b/libvips/foreign/vips2heif.c
index bff14c9f..0dfcfb1c 100644
--- a/libvips/foreign/vips2heif.c
+++ b/libvips/foreign/vips2heif.c
@@ -513,7 +513,7 @@ vips_foreign_save_heif_class_init( VipsForeignSaveHeifClass *class )
 		_( "CPU effort" ),
 		VIPS_ARGUMENT_OPTIONAL_INPUT,
 		G_STRUCT_OFFSET( VipsForeignSaveHeif, speed ),
-		0, 8, 5 );
+		0, 9, 5 );
 
 	VIPS_ARG_ENUM( class, "subsample_mode", 16,
 		_( "Subsample mode" ),
diff --git a/libvips/foreign/vips2tiff.c b/libvips/foreign/vips2tiff.c
index 626b36ba..ca2e0c2b 100644
--- a/libvips/foreign/vips2tiff.c
+++ b/libvips/foreign/vips2tiff.c
@@ -270,6 +270,16 @@
  */
 #define MAX_ALPHA (64)
 
+/* Bioformats uses this tag for lossy jp2k compressed tiles.
+ */
+#define JP2K_LOSSY 33004
+
+/* Compression types we compress ourselves. Read-only lookup table.
+ */
+static const int wtiff_we_compress[] = {
+	JP2K_LOSSY
+};
+
 typedef struct _Layer Layer;
 typedef struct _Wtiff Wtiff;
 
@@ -351,9 +361,10 @@ struct _Wtiff {
 	int strip;			/* Don't write metadata */
 	VipsRegionShrink region_shrink; /* How to shrink regions */
 	int level;			/* zstd compression level */
-	gboolean lossless;		/* webp lossless mode */
+	gboolean lossless;		/* lossless mode */
 	VipsForeignDzDepth depth;	/* Pyr depth */
 	gboolean subifd;		/* Write pyr layers into subifds */
+	gboolean premultiply;		/* Premultiply alpha */
 
 	/* True if we've detected a toilet-roll image, plus the page height,
 	 * which has been checked to be a factor of im->Ysize. page_number
@@ -368,6 +379,16 @@ struct _Wtiff {
 	 * roll mode.
 	 */
 	int image_height;
+
+	/* TRUE if the compression type is not supported by libtiff directly
+	 * and we must compress ourselves. 
+	 */
+	gboolean we_compress;
+
+	/* If we are copying, we need a buffer to read the compressed tile to.
+	 */
+	tdata_t compressed_buf;
+	tsize_t compressed_buf_length;
 };
 
 /* Write an ICC Profile from a file into the JPEG stream.
@@ -636,6 +657,7 @@ wtiff_write_header( Wtiff *wtiff, Layer *layer )
 {
 	TIFF *tif = layer->tif;
 
+	int i;
 	int orientation; 
 
 #ifdef DEBUG
@@ -672,6 +694,12 @@ wtiff_write_header( Wtiff *wtiff, Layer *layer )
 		wtiff->predictor != VIPS_FOREIGN_TIFF_PREDICTOR_NONE ) 
 		TIFFSetField( tif, TIFFTAG_PREDICTOR, wtiff->predictor );
 
+	for( i = 0; i < VIPS_NUMBER( wtiff_we_compress ); i++ )
+		if( wtiff->compression == wtiff_we_compress[i] ) {
+			wtiff->we_compress = TRUE;
+			break;
+		}
+
 	/* Don't write mad resolutions (eg. zero), it confuses some programs.
 	 */
 	TIFFSetField( tif, TIFFTAG_RESOLUTIONUNIT, wtiff->resunit );
@@ -791,9 +819,14 @@ wtiff_write_header( Wtiff *wtiff, Layer *layer )
 			/* EXTRASAMPLE_UNASSALPHA means generic extra
 			 * alpha-like channels. ASSOCALPHA means pre-multipled
 			 * alpha only. 
+			 *
+			 * Make the first channel the premultiplied alpha, if
+			 * we are premultiplying.
 			 */
 			for( i = 0; i < alpha_bands; i++ )
-				v[i] = EXTRASAMPLE_UNASSALPHA;
+				v[i] = i == 0 && wtiff->premultiply ? 
+					EXTRASAMPLE_ASSOCALPHA :
+					EXTRASAMPLE_UNASSALPHA;
 			TIFFSetField( tif, 
 				TIFFTAG_EXTRASAMPLES, alpha_bands, v );
 		}
@@ -926,6 +959,20 @@ wtiff_allocate_layers( Wtiff *wtiff )
 			return( -1 );
 	}
 
+	/* If we will be copying layers we need a buffer large enough to hold
+	 * the largest compressed tile in any page.
+	 *
+	 * Allocate a buffer 2x the uncompressed tile size ... much simpler
+	 * than searching every page for the largest tile with
+	 * TIFFTAG_TILEBYTECOUNTS.
+	 */
+	if( wtiff->pyramid ) {
+		wtiff->compressed_buf_length = 2 * wtiff->tls * wtiff->tileh;
+		if( !(wtiff->compressed_buf = vips_malloc( NULL,
+			wtiff->compressed_buf_length )) )
+			return( -1 );
+	}
+
 	return( 0 );
 }
 
@@ -981,11 +1028,11 @@ static void
 wtiff_free( Wtiff *wtiff )
 {
 	wtiff_delete_temps( wtiff );
-
 	VIPS_UNREF( wtiff->ready );
 	VIPS_FREE( wtiff->tbuf );
 	VIPS_FREEF( layer_free_all, wtiff->layer );
 	VIPS_FREE( wtiff->filename );
+	VIPS_FREE( wtiff->compressed_buf );
 	VIPS_FREE( wtiff );
 }
 
@@ -1011,6 +1058,8 @@ get_compression( VipsForeignTiffCompression compression )
 	case VIPS_FOREIGN_TIFF_COMPRESSION_ZSTD:
 		return( COMPRESSION_ZSTD );
 #endif /*HAVE_TIFF_COMPRESSION_WEBP*/
+	case VIPS_FOREIGN_TIFF_COMPRESSION_JP2K:
+		return( JP2K_LOSSY );
 	
 	default:
 		return( COMPRESSION_NONE );
@@ -1040,24 +1089,56 @@ get_resunit( VipsForeignTiffResunit resunit )
 static int
 ready_to_write( Wtiff *wtiff )
 {
-	if( vips_check_coding_known( "vips2tiff", wtiff->input ) )
+	VipsImage *input;
+	VipsImage *x;
+
+	input = wtiff->input;
+	g_object_ref( input );	/* we hold one ref on input from here on */
+
+	if( vips_check_coding_known( "vips2tiff", input ) ) {
+		VIPS_UNREF( input );
 		return( -1 );
+	}
+
+	/* Premultiply any alpha, if necessary.
+	 */
+	if( wtiff->premultiply &&
+		vips_image_hasalpha( input ) ) {
+		VipsBandFormat start_format = input->BandFmt;
+
+		if( vips_premultiply( input, &x, NULL ) ) {
+			VIPS_UNREF( input );
+			return( -1 );
+		}
+		VIPS_UNREF( input );
+		input = x;	/* swap our ref for the new stage */
+
+		/* Premultiply always makes a float -- cast back again.
+		 */
+		if( vips_cast( input, &x, start_format, NULL ) ) {
+			VIPS_UNREF( input );
+			return( -1 );
+		}
+		VIPS_UNREF( input );
+		input = x;
+	}
 
 	/* "squash" float LAB down to LABQ.
 	 */
 	if( wtiff->bitdepth &&
-		wtiff->input->Bands == 3 &&
-		wtiff->input->BandFmt == VIPS_FORMAT_FLOAT &&
-		wtiff->input->Type == VIPS_INTERPRETATION_LAB ) {
-		if( vips_Lab2LabQ( wtiff->input, &wtiff->ready, NULL ) )
+		input->Bands == 3 &&
+		input->BandFmt == VIPS_FORMAT_FLOAT &&
+		input->Type == VIPS_INTERPRETATION_LAB ) {
+		if( vips_Lab2LabQ( input, &x, NULL ) ) {
+			VIPS_UNREF( input );
 			return( -1 );
-		wtiff->bitdepth = 0;
-	}
-	else {
-		wtiff->ready = wtiff->input;
-		g_object_ref( wtiff->ready );
+		}
+		VIPS_UNREF( input );
+		input = x;	/* NOTE(review): bitdepth no longer zeroed -- confirm */
 	}
 
+	wtiff->ready = input;	/* our ref passes to wtiff->ready */
+
 	return( 0 );
 }
 
@@ -1079,7 +1160,8 @@ wtiff_new( VipsImage *input, const char *filename,
 	int level, 
 	gboolean lossless,
 	VipsForeignDzDepth depth, 
-	gboolean subifd )
+	gboolean subifd,
+	gboolean premultiply )
 {
 	Wtiff *wtiff;
 
@@ -1112,6 +1194,7 @@ wtiff_new( VipsImage *input, const char *filename,
 	wtiff->lossless = lossless;
 	wtiff->depth = depth;
 	wtiff->subifd = subifd;
+	wtiff->premultiply = premultiply;
 	wtiff->toilet_roll = FALSE;
 	wtiff->page_height = vips_image_get_page_height( input );
 	wtiff->page_number = 0;
@@ -1237,21 +1320,6 @@ wtiff_new( VipsImage *input, const char *filename,
 		wtiff->miniswhite = FALSE;
 	}
 
-	/* lossless is for webp only.
-	 */
-#ifdef HAVE_TIFF_COMPRESSION_WEBP
-	if( wtiff->lossless ) {
-		if( wtiff->compression == COMPRESSION_NONE )
-			wtiff->compression = COMPRESSION_WEBP;
-
-		if( wtiff->compression != COMPRESSION_WEBP ) {
-			g_warning( "%s", 
-				_( "lossless is for WEBP compression only" ) );
-			wtiff->lossless = FALSE;
-		}
-	}
-#endif /*HAVE_TIFF_COMPRESSION_WEBP*/
-
 	/* Sizeof a line of bytes in the TIFF tile.
 	 */
 	if( wtiff->ready->Coding == VIPS_CODING_LABQ )
@@ -1489,7 +1557,7 @@ wtiff_pack2tiff( Wtiff *wtiff, Layer *layer,
 /* Write a set of tiles across the strip.
  */
 static int
-wtiff_layer_write_tile( Wtiff *wtiff, Layer *layer, VipsRegion *strip )
+wtiff_layer_write_tiles( Wtiff *wtiff, Layer *layer, VipsRegion *strip )
 {
 	VipsImage *im = layer->image;
 	VipsRect *area = &strip->valid;
@@ -1511,21 +1579,88 @@ wtiff_layer_write_tile( Wtiff *wtiff, Layer *layer, VipsRegion *strip )
 		tile.height = wtiff->tileh;
 		vips_rect_intersectrect( &tile, &image, &tile );
 
-		/* Have to repack pixels.
-		 */
-		wtiff_pack2tiff( wtiff, layer, strip, &tile, wtiff->tbuf );
-
 #ifdef DEBUG_VERBOSE
 		printf( "Writing %dx%d tile at position %dx%d to image %s\n",
 			tile.width, tile.height, tile.left, tile.top,
 			TIFFFileName( layer->tif ) );
 #endif /*DEBUG_VERBOSE*/
 
-		if( TIFFWriteTile( layer->tif, wtiff->tbuf, 
-			tile.left, tile.top, 0, 0 ) < 0 ) {
-			vips_error( "vips2tiff", 
-				"%s", _( "TIFF write tile failed" ) );
-			return( -1 );
+		if( wtiff->we_compress ) {
+			ttile_t tile_no = TIFFComputeTile( layer->tif,
+				tile.left, tile.top, 0, 0 );
+
+			VipsTarget *target;
+			int result;
+			unsigned char *buffer;
+			size_t length;
+
+			target = vips_target_new_to_memory();
+
+			switch( wtiff->compression ) {
+			case JP2K_LOSSY:
+				/* Sadly chroma subsample seems not to work
+				 * for edge tiles in tiff with jp2k
+				 * compression, so we always pass FALSE
+				 * instead of:
+				 *
+				 * 	!wtiff->rgbjpeg && wtiff->Q < 90,
+				 *
+				 * I've verified that the libvips jp2k
+				 * encode and decode subsample operations fill
+				 * the comps[i].data arrays correctly, so it
+				 * seems to be a openjpeg bug.
+				 *
+				 * FIXME ... try again with openjpeg 2.5,
+				 * when that comes.
+				 */
+				result = vips__foreign_load_jp2k_compress( 
+					strip, &tile, target,
+					wtiff->tilew, wtiff->tileh,
+					!wtiff->rgbjpeg,
+				 	// !wtiff->rgbjpeg && wtiff->Q < 90,
+					FALSE,
+					wtiff->lossless, 
+					wtiff->Q );
+				break;
+
+			default:
+				result = -1;
+				g_assert_not_reached();
+				break;
+			}
+
+			if( result ) {
+				g_object_unref( target );
+				return( -1 );
+			}
+
+			buffer = vips_target_steal( target, &length );
+
+			g_object_unref( target );
+
+			result = TIFFWriteRawTile( layer->tif, tile_no, 
+				buffer, length );
+
+			g_free( buffer );
+		
+			if( result < 0 ) {
+				vips_error( "vips2tiff", 
+					"%s", _( "TIFF write tile failed" ) );
+				return( -1 );
+			}
+		}
+		else {
+			/* Have to repack pixels for libtiff.
+			 */
+			wtiff_pack2tiff( wtiff, 
+				layer, strip, &tile, wtiff->tbuf );
+
+			if( TIFFWriteTile( layer->tif, wtiff->tbuf, 
+				tile.left, tile.top, 0, 0 ) < 0 ) {
+				vips_error( "vips2tiff", 
+					"%s", _( "TIFF write tile failed" ) );
+				return( -1 );
+			}
 		}
 	}
 
@@ -1672,7 +1807,7 @@ layer_strip_arrived( Layer *layer )
 	VipsRect image_area;
 
 	if( wtiff->tile ) 
-		result = wtiff_layer_write_tile( wtiff, layer, layer->strip );
+		result = wtiff_layer_write_tiles( wtiff, layer, layer->strip );
 	else
 		result = wtiff_layer_write_strip( wtiff, layer, layer->strip );
 	if( result )
@@ -1803,7 +1938,7 @@ wtiff_copy_tiff( Wtiff *wtiff, TIFF *out, TIFF *in )
 	uint16 ui16_2;
 	float f;
 	tdata_t buf;
-	ttile_t tile;
+	ttile_t tile_no;
 	ttile_t n;
 	uint16 *a;
 
@@ -1891,19 +2026,21 @@ wtiff_copy_tiff( Wtiff *wtiff, TIFF *out, TIFF *in )
 
 	buf = vips_malloc( NULL, TIFFTileSize( in ) );
 	n = TIFFNumberOfTiles( in );
-	for( tile = 0; tile < n; tile++ ) {
+	for( tile_no = 0; tile_no < n; tile_no++ ) {
 		tsize_t len;
 
-		/* It'd be good to use TIFFReadRawTile()/TIFFWtiffRawTile() 
-		 * here to save compression/decompression, but sadly it seems
-		 * not to work :-( investigate at some point.
-		 */
-		len = TIFFReadEncodedTile( in, tile, buf, -1 );
-		if( len < 0 ||
-			TIFFWriteEncodedTile( out, tile, buf, len ) < 0 ) {
+		/* Copy the compressed tile data across as-is, with no
+		 * decompress/recompress. buf is still allocated above, so
+		 * it must be released on the error path too.
+		 */
+		len = TIFFReadRawTile( in, tile_no, 
+			wtiff->compressed_buf, wtiff->compressed_buf_length );
+		if( len <= 0 ||
+			TIFFWriteRawTile( out, tile_no, 
+				wtiff->compressed_buf, len ) < 0 ) {
 			g_free( buf );
 			return( -1 );
 		}
 	}
 	g_free( buf );
 
@@ -2085,7 +2216,8 @@ vips__tiff_write( VipsImage *input, const char *filename,
 	int level, 
 	gboolean lossless,
 	VipsForeignDzDepth depth,
-	gboolean subifd )
+	gboolean subifd,
+	gboolean premultiply )
 {
 	Wtiff *wtiff;
 
@@ -2100,7 +2232,7 @@ vips__tiff_write( VipsImage *input, const char *filename,
                 tile, tile_width, tile_height, pyramid, bitdepth,
 		miniswhite, resunit, xres, yres, bigtiff, rgbjpeg, 
 		properties, strip, region_shrink, level, lossless, depth,
-		subifd )) )
+		subifd, premultiply )) )
 		return( -1 );
 
 	if( wtiff_write_image( wtiff ) ) { 
@@ -2131,7 +2263,8 @@ vips__tiff_write_buf( VipsImage *input,
 	int level, 
 	gboolean lossless,
 	VipsForeignDzDepth depth,
-	gboolean subifd )
+	gboolean subifd,
+	gboolean premultiply )
 {
 	Wtiff *wtiff;
 
@@ -2142,7 +2275,7 @@ vips__tiff_write_buf( VipsImage *input,
                 tile, tile_width, tile_height, pyramid, bitdepth,
 		miniswhite, resunit, xres, yres, bigtiff, rgbjpeg, 
 		properties, strip, region_shrink, level, lossless, depth,
-		subifd )) )
+		subifd, premultiply )) )
 		return( -1 );
 
 	wtiff->obuf = obuf;
diff --git a/libvips/include/vips/foreign.h b/libvips/include/vips/foreign.h
index 15e114c0..0d19e38b 100644
--- a/libvips/include/vips/foreign.h
+++ b/libvips/include/vips/foreign.h
@@ -465,6 +465,7 @@ int vips_webpsave_mime( VipsImage *in, ... )
  * @VIPS_FOREIGN_TIFF_COMPRESSION_LZW: LZW compression
  * @VIPS_FOREIGN_TIFF_COMPRESSION_WEBP: WEBP compression
  * @VIPS_FOREIGN_TIFF_COMPRESSION_ZSTD: ZSTD compression
+ * @VIPS_FOREIGN_TIFF_COMPRESSION_JP2K: JP2K compression
  *
  * The compression types supported by the tiff writer.
  *
@@ -485,6 +486,7 @@ typedef enum {
 	VIPS_FOREIGN_TIFF_COMPRESSION_LZW,
 	VIPS_FOREIGN_TIFF_COMPRESSION_WEBP,
 	VIPS_FOREIGN_TIFF_COMPRESSION_ZSTD,
+	VIPS_FOREIGN_TIFF_COMPRESSION_JP2K,
 	VIPS_FOREIGN_TIFF_COMPRESSION_LAST
 } VipsForeignTiffCompression;
 
diff --git a/libvips/include/vips/region.h b/libvips/include/vips/region.h
index 9d7a0e6c..964fee5d 100644
--- a/libvips/include/vips/region.h
+++ b/libvips/include/vips/region.h
@@ -165,18 +165,16 @@ void vips_region_invalidate( VipsRegion *reg );
 #define VIPS_COUNT_PIXELS( R, N ) 
 #endif /*DEBUG_LEAK*/
 
-/* Macros on VipsRegion.
- *	VIPS_REGION_LSKIP()		add to move down line
- *	VIPS_REGION_N_ELEMENTS()	number of elements across region
- *	VIPS_REGION_SIZEOF_LINE()	sizeof width of region
- *	VIPS_REGION_ADDR()		address of pixel in region
- */
 #define VIPS_REGION_LSKIP( R ) \
 	((size_t)((R)->bpl))
 #define VIPS_REGION_N_ELEMENTS( R ) \
 	((size_t)((R)->valid.width * (R)->im->Bands))
+#define VIPS_REGION_SIZEOF_ELEMENT( R ) \
+	(VIPS_IMAGE_SIZEOF_ELEMENT( (R)->im ))
+#define VIPS_REGION_SIZEOF_PEL( R ) \
+	(VIPS_IMAGE_SIZEOF_PEL( (R)->im ))
 #define VIPS_REGION_SIZEOF_LINE( R ) \
-	((size_t)((R)->valid.width * VIPS_IMAGE_SIZEOF_PEL( (R)->im) ))
+	((size_t)((R)->valid.width * VIPS_REGION_SIZEOF_PEL( R )))
 
 /* If DEBUG is defined, add bounds checking.
  */
@@ -184,7 +182,7 @@ void vips_region_invalidate( VipsRegion *reg );
 #define VIPS_REGION_ADDR( R, X, Y ) \
 	( (vips_rect_includespoint( &(R)->valid, (X), (Y) ))? \
 	  ((R)->data + ((Y) - (R)->valid.top) * VIPS_REGION_LSKIP(R) + \
-	  ((X) - (R)->valid.left) * VIPS_IMAGE_SIZEOF_PEL((R)->im)): \
+	  ((X) - (R)->valid.left) * VIPS_REGION_SIZEOF_PEL( R )): \
 	  (fprintf( stderr, \
 		"VIPS_REGION_ADDR: point out of bounds, " \
 		"file \"%s\", line %d\n" \
@@ -202,7 +200,7 @@ void vips_region_invalidate( VipsRegion *reg );
 #define VIPS_REGION_ADDR( R, X, Y ) \
 	((R)->data + \
 	((Y)-(R)->valid.top) * VIPS_REGION_LSKIP( R ) + \
-	((X)-(R)->valid.left) * VIPS_IMAGE_SIZEOF_PEL( (R)->im ))
+	((X)-(R)->valid.left) * VIPS_REGION_SIZEOF_PEL( R ))
 #endif /*DEBUG*/
 
 #define VIPS_REGION_ADDR_TOPLEFT( R ) ((R)->data)
diff --git a/libvips/iofuncs/enumtypes.c b/libvips/iofuncs/enumtypes.c
index d3cc9bfa..d7fa839d 100644
--- a/libvips/iofuncs/enumtypes.c
+++ b/libvips/iofuncs/enumtypes.c
@@ -574,6 +574,7 @@ vips_foreign_tiff_compression_get_type( void )
 			{VIPS_FOREIGN_TIFF_COMPRESSION_LZW, "VIPS_FOREIGN_TIFF_COMPRESSION_LZW", "lzw"},
 			{VIPS_FOREIGN_TIFF_COMPRESSION_WEBP, "VIPS_FOREIGN_TIFF_COMPRESSION_WEBP", "webp"},
 			{VIPS_FOREIGN_TIFF_COMPRESSION_ZSTD, "VIPS_FOREIGN_TIFF_COMPRESSION_ZSTD", "zstd"},
+			{VIPS_FOREIGN_TIFF_COMPRESSION_JP2K, "VIPS_FOREIGN_TIFF_COMPRESSION_JP2K", "jp2k"},
 			{VIPS_FOREIGN_TIFF_COMPRESSION_LAST, "VIPS_FOREIGN_TIFF_COMPRESSION_LAST", "last"},
 			{0, NULL, NULL}
 		};
diff --git a/libvips/module/Makefile.am b/libvips/module/Makefile.am
new file mode 100644
index 00000000..744b79df
--- /dev/null
+++ b/libvips/module/Makefile.am
@@ -0,0 +1,5 @@
+EXTRA_DIST = \
+	heif.c \
+	magick.c \
+	openslide.c \
+	poppler.c
diff --git a/test/test-suite/test_foreign.py b/test/test-suite/test_foreign.py
index 43ca6a3b..d1a82e73 100644
--- a/test/test-suite/test_foreign.py
+++ b/test/test-suite/test_foreign.py
@@ -17,7 +17,7 @@ from helpers import \
     GIF_ANIM_DISPOSE_PREVIOUS_EXPECTED_PNG_FILE, \
     temp_filename, assert_almost_equal_objects, have, skip_if_no, \
     TIF1_FILE, TIF2_FILE, TIF4_FILE, WEBP_LOOKS_LIKE_SVG_FILE, \
-    WEBP_ANIMATED_FILE, JP2K_FILE
+    WEBP_ANIMATED_FILE, JP2K_FILE, RGBA_FILE
 
 class TestForeign:
     tempdir = None
@@ -27,6 +27,7 @@ class TestForeign:
         cls.tempdir = tempfile.mkdtemp()
 
         cls.colour = pyvips.Image.jpegload(JPEG_FILE)
+        cls.rgba = pyvips.Image.new_from_file(RGBA_FILE)
         cls.mono = cls.colour.extract_band(1).copy()
         # we remove the ICC profile: the RGB one will no longer be appropriate
         cls.mono.remove("icc-profile-data")
@@ -387,15 +388,14 @@ class TestForeign:
         self.save_load("%s.tif", self.mono)
         self.save_load("%s.tif", self.colour)
         self.save_load("%s.tif", self.cmyk)
-
+        self.save_load("%s.tif", self.rgba)
         self.save_load("%s.tif", self.onebit)
+
         self.save_load_file(".tif", "[bitdepth=1]", self.onebit)
         self.save_load_file(".tif", "[miniswhite]", self.onebit)
         self.save_load_file(".tif", "[bitdepth=1,miniswhite]", self.onebit)
 
-        self.save_load_file(".tif",
-                            "[profile={0}]".format(SRGB_FILE),
-                            self.colour)
+        self.save_load_file(".tif", f"[profile={SRGB_FILE}]", self.colour)
         self.save_load_file(".tif", "[tile]", self.colour)
         self.save_load_file(".tif", "[tile,pyramid]", self.colour)
         self.save_load_file(".tif", "[tile,pyramid,subifd]", self.colour)
@@ -510,11 +510,17 @@ class TestForeign:
             buf2 = f.read()
         assert len(buf) == len(buf2)
 
+        filename = temp_filename(self.tempdir, '.tif')
+        self.rgba.write_to_file(filename, premultiply=True)
+        a = pyvips.Image.new_from_file(filename)
+        b = self.rgba.premultiply().cast("uchar").unpremultiply().cast("uchar")
+        assert (a == b).min() == 255
+
         a = pyvips.Image.new_from_buffer(buf, "", page=2)
         b = pyvips.Image.new_from_buffer(buf2, "", page=2)
         assert a.width == b.width
         assert a.height == b.height
-        assert a.avg() == b.avg()
+        assert (a == b).min() == 255
 
         # just 0/255 in each band, shrink with mode and all pixels should be 0
         # or 255 in layer 1
@@ -525,6 +531,16 @@ class TestForeign:
             z = y.hist_find(band=0)
             assert z(0, 0)[0] + z(255, 0)[0] == y.width * y.height
 
+    @skip_if_no("jp2kload")
+    @skip_if_no("tiffload")
+    def test_tiffjp2k(self):
+        self.save_load_file(".tif", "[tile,compression=jp2k]", self.colour, 80)
+        self.save_load_file(".tif",
+                            "[tile,pyramid,compression=jp2k]", self.colour, 80)
+        self.save_load_file(".tif",
+                            "[tile,pyramid,subifd,compression=jp2k]",
+                            self.colour, 80)
+
     @skip_if_no("magickload")
     def test_magickload(self):
         def bmp_valid(im):