From 2003b5b5243712921be3df1d14139c5a884c7aee Mon Sep 17 00:00:00 2001
From: John Cupitt
Date: Fri, 23 Sep 2011 22:01:32 +0100
Subject: [PATCH] csv read knows about quoted strings

you can "enclose strings in \", and" it shouldn't get confused
---
 ChangeLog                    |  1 +
 TODO                         |  9 ++-----
 libvips/format/im_csv2vips.c | 51 ++++++++++++++++++++++++++++++++----
 3 files changed, 49 insertions(+), 12 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 828d2734..064dd8a0 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -15,6 +15,7 @@
 - vips_tracked_malloc() tracks allocation size and can report total mem usage
 - cache limits, drop, init, flush plus command-line controls
 - remove dmalloc support, was never used and valgrind is better
+- im_csv2vips() allows quoted strings, including escaped quotes
 
 10/8/11 started 7.26.3
 - don't use G_VALUE_COLLECT_INIT(), many platforms do not have a glib this
diff --git a/TODO b/TODO
index cbce91e0..bfb8e0d1 100644
--- a/TODO
+++ b/TODO
@@ -5,16 +5,11 @@
 
 
 
-- im_csv2vips() gets confused by quotes and commas, eg.
-
-  NP_001121179.1,"serine proteinase inhibitor, clade A, member ",ITPNLAEFAFSLYR,0.95588235294118,0.96176470588235, ...
-
-
-
-
 - add vips_init_argv() which processes argc/argv for you? handy for tiny
   progs, perhaps
 
+- add vips_shutdown()? unload plugins, drop cache etc.
+
 
 
 
diff --git a/libvips/format/im_csv2vips.c b/libvips/format/im_csv2vips.c
index 731a4a0d..c8e9369a 100644
--- a/libvips/format/im_csv2vips.c
+++ b/libvips/format/im_csv2vips.c
@@ -13,6 +13,8 @@
  * 	- gtkdoc
  * 1/3/10
  * 	- allow lines that end with EOF rather than \n
+ * 23/9/11
+ * 	- allow quoted strings, including escaped quotes
  */
 
 /*
@@ -93,6 +95,29 @@ skip_white( FILE *fp, const char whitemap[256] )
 	return( ch );
 }
 
+/* Skip to the closing " of a string, stopping early at EOL or EOF. The char
+ * after a \ is swallowed, so \" never closes the string, even in runs like
+ * \"\". The stop char is pushed back (ungetc() ignores EOF) and returned.
+ */
+static int
+skip_to_quote( FILE *fp )
+{
+	int ch;
+
+	do {
+		ch = fgetc( fp );
+
+		if( ch == '\\' )
+			ch = fgetc( fp );
+		else if( ch == '"' )
+			break;
+	} while( ch != EOF && ch != '\n' );
+
+	ungetc( ch, fp );
+
+	return( ch );
+}
+
 static int
 skip_to_sep( FILE *fp, const char sepmap[256] )
 {
@@ -109,7 +134,15 @@ skip_to_sep( FILE *fp, const char sepmap[256] )
 
 /* Read a single item. Syntax is:
  *
- * item : whitespace* double? whitespace* [EOF|EOL|separator]
+ * element :
+ * 	whitespace* item whitespace* [EOF|EOL|separator]
+ *
+ * item :
+ * 	double |
+ * 	"anything" |
+ * 	empty
+ *
+ * the anything in quotes can contain " escaped with \
  *
  * Return the char that caused failure on fail (EOF or \n).
  */
@@ -127,7 +160,12 @@ read_double( FILE *fp, const char whitemap[256], const char sepmap[256],
 	if( ch == EOF || ch == '\n' )
 		return( ch );
 
-	if( !sepmap[ch] && fscanf( fp, "%lf", out ) != 1 ) {
+	if( ch == '"' ) {
+		(void) fgetc( fp );
+		ch = skip_to_quote( fp );
+		ch = fgetc( fp );
+	}
+	else if( !sepmap[ch] && fscanf( fp, "%lf", out ) != 1 ) {
 		/* Only a warning, since (for example) exported spreadsheets
 		 * will often have text or date fields.
 		 */
@@ -273,6 +311,10 @@ read_csv( FILE *fp, IMAGE *out,
  *
  * Load a CSV (comma-separated values) file. The output image is always 1
  * band (monochrome), %IM_BANDFMT_DOUBLE.
+ *
+ * Items in lines can be either floats, or strings enclosed in double-quotes.
+ * You can use a backslash (\) within the quotes to escape special characters.
+ *
+ * The reader is deliberately rather fussy: it will fail if there are any
  * short lines, or if the file is too short. It will ignore lines that are
  * too long.
@@ -297,8 +339,7 @@ read_csv( FILE *fp, IMAGE *out,
  *
  *
  * whi:whitespace-characters
- * The skippable whitespace characters. Default space and
- * double quotes (").
+ * The skippable whitespace characters. Default space.
  * Whitespace characters are always run together.
  *
  *
@@ -330,7 +371,7 @@ im_csv2vips( const char *filename, IMAGE *out )
 	/* Read options.
 	 */
 	int start_skip = 0;
-	char *whitespace = " \"";
+	char *whitespace = " ";
 	char *separator = ";,\t";
 	int lines = -1;
 