allow utf-8 header for svg detection (#2481)

* allow utf-8 header for svg detection

We were checking that the first 24 chars of an SVG were plain ASCII,
but that's not always the case, for example:

	<svg id="レイヤー_1のコピー"
		data-name="レイヤー 1のコピー"
		xmlns="http://www.w3.org/2000/svg"
		viewBox="0 0 100 100">
	</svg>

We now test for the string "<svg" being in the first 1000 bytes, and
everything up to that being valid utf-8.

See https://github.com/libvips/libvips/issues/2438

* raise priority of webpload

it was very low priority before, for some reason
This commit is contained in:
John Cupitt 2021-10-15 13:21:50 +01:00 committed by GitHub
parent e1a7063999
commit b2527da531
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 89 additions and 16 deletions

View File

@ -22,6 +22,8 @@
* 11/6/21
* - switch to rsvg_handle_render_document()
* - librsvg can no longer render very large images :(
* 14/10/21
* - allow utf-8 headers for svg detection
*/
/*
@ -131,6 +133,85 @@ vips_foreign_load_svg_zfree( void *opaque, void *ptr )
}
#endif /*HANDLE_SVGZ*/
/* Find a utf-8 substring within the first len_bytes (not characters) of
 * haystack_start.
 *
 * - case-insensitive
 * - needle must be zero-terminated, but haystack need not be
 * - haystack can be null-terminated
 * - if haystack is shorter than len_bytes, that'll end the search
 * - if we hit invalid utf-8, we return NULL
 *
 * Returns a pointer to the start of the first match within haystack, or
 * NULL if there is no match. No byte at or beyond haystack_start + len_bytes
 * is ever read.
 */
static const char *
vips_utf8_strcasestr( const char *haystack_start, const char *needle_start,
	int len_bytes )
{
	int needle_len = g_utf8_strlen( needle_start, -1 );
	int needle_len_bytes = strlen( needle_start );

	const char *haystack_end;
	const char *haystack;

	/* No room for even one needle. Test this up front so we never form
	 * an end pointer from a negative length.
	 */
	if( len_bytes < needle_len_bytes )
		return( NULL );

	haystack_end = haystack_start + len_bytes;

	/* Try each character start in turn. The step is bounded by
	 * haystack_end, so a run of stray continuation bytes can't carry us
	 * past the end of the buffer. g_utf8_find_next_char() returns NULL
	 * once the end is reached, which stops the loop.
	 */
	for( haystack = haystack_start;
		haystack &&
			haystack_end - haystack >= needle_len_bytes;
		haystack = g_utf8_find_next_char( haystack, haystack_end ) ) {
		const char *needle_char;
		const char *haystack_char;
		int i;

		haystack_char = haystack;
		needle_char = needle_start;
		for( i = 0; i < needle_len; i++ ) {
			gunichar a;
			gunichar b;

			/* The previous step ran into the end of haystack
			 * with some needle still left to match. Any later
			 * start has even less room, so there can't be a
			 * complete needle anywhere.
			 */
			if( !haystack_char )
				return( NULL );

			/* Haystack isn't necessarily null-terminated and
			 * might end half-way through a utf-8 character, so we
			 * need to be careful not to run off the end.
			 *
			 * The byte budget must be counted from haystack_char,
			 * the character we are about to read, not from the
			 * start of this candidate match.
			 */
			a = g_utf8_get_char_validated( haystack_char,
				haystack_end - haystack_char );
			b = g_utf8_get_char_validated( needle_char, -1 );

			/* Invalid utf8?
			 *
			 * gunichar is a uint32, so we can't compare < 0, we
			 * have to look for -1 and -2 (the two possible error
			 * values).
			 */
			if( a == (gunichar) -1 ||
				a == (gunichar) -2 ||
				b == (gunichar) -1 ||
				b == (gunichar) -2 )
				return( NULL );

			/* End of haystack. There can't be a complete needle
			 * anywhere.
			 */
			if( a == (gunichar) 0 )
				return( NULL );

			/* Mismatch.
			 */
			if( g_unichar_tolower( a ) != g_unichar_tolower( b ) )
				break;

			haystack_char =
				g_utf8_find_next_char( haystack_char,
					haystack_end );
			needle_char =
				g_utf8_find_next_char( needle_char, NULL );
		}

		if( i == needle_len )
			/* Walked the whole of needle, so we must have found a
			 * complete match.
			 */
			return( haystack );
	}

	/* Walked the whole of haystack without finding a match.
	 */
	return( NULL );
}
/* This is used by both the file and buffer subclasses.
*/
static gboolean
@ -145,8 +226,6 @@ vips_foreign_load_svg_is_a( const void *buf, size_t len )
char obuf[SVG_HEADER_SIZE];
#endif /*HANDLE_SVGZ*/
int i;
/* Start with str pointing at the argument buffer, swap to it pointing
* into obuf if we see zip data.
*/
@ -200,23 +279,17 @@ vips_foreign_load_svg_is_a( const void *buf, size_t len )
*
* But there can be a doctype in there too. And case and whitespace can
* vary a lot. And the <?xml can be missing. And you can have a comment
* before the <svg line.
* before the <svg line. And it can be utf-8, so non ASCII characters.
*
* Simple rules:
* - first 24 chars are plain ascii (x09-x7F)
* - first SVG_HEADER_SIZE chars contain "<svg", upper or lower case.
* All we do is look for "<svg", any case, within the first
* SVG_HEADER_SIZE bytes, where the bytestream up to the "<svg" is
* valid utf-8.
*
* We could rsvg_handle_new_from_data() on the buffer, but that can be
* horribly slow for large documents.
*/
if( len < 24 )
return( 0 );
for( i = 0; i < 24; i++ )
if( !isascii( str[i] ) || str[i] < 9 )
return( FALSE );
for( i = 0; i < SVG_HEADER_SIZE && i < len - 5; i++ )
if( g_ascii_strncasecmp( str + i, "<svg", 4 ) == 0 )
return( TRUE );
if( vips_utf8_strcasestr( str, "<svg", len ) )
return( TRUE );
return( FALSE );
}

View File

@ -166,9 +166,9 @@ vips_foreign_load_webp_class_init( VipsForeignLoadWebpClass *class )
object_class->description = _( "load webp" );
object_class->build = vips_foreign_load_webp_build;
/* is_a() is not that quick ... lower the priority.
/* We are fast at is_a(), so high priority.
*/
foreign_class->priority = -50;
foreign_class->priority = 200;
load_class->get_flags_filename =
vips_foreign_load_webp_get_flags_filename;