diff --git a/src/wp-admin/includes/upgrade.php b/src/wp-admin/includes/upgrade.php index 146d6a8750..ece8daf349 100644 --- a/src/wp-admin/includes/upgrade.php +++ b/src/wp-admin/includes/upgrade.php @@ -527,7 +527,7 @@ function upgrade_all() { if ( $wp_current_db_version < 31351 ) upgrade_420(); - if ( $wp_current_db_version < 32308 ) + if ( $wp_current_db_version < 32364 ) upgrade_430(); maybe_disable_link_manager(); @@ -1446,17 +1446,33 @@ function upgrade_420() { function upgrade_430() { global $wp_current_db_version, $wpdb; - if ( $wp_current_db_version < 32308 ) { + if ( $wp_current_db_version < 32364 ) { $content_length = $wpdb->get_col_length( $wpdb->comments, 'comment_content' ); - if ( ! $content_length ) { - $content_length = 65535; + if ( false === $content_length ) { + $content_length = array( + 'type' => 'byte', + 'length' => 65535, + ); + } elseif ( ! is_array( $content_length ) ) { + $length = (int) $content_length > 0 ? (int) $content_length : 65535; + $content_length = array( + 'type' => 'byte', + 'length' => $length + ); } + if ( 'byte' !== $content_length['type'] ) { + // Sites with malformed DB schemas are on their own. + return; + } + + $allowed_length = intval( $content_length['length'] ) - 10; + $comments = $wpdb->get_results( - "SELECT comment_ID FROM $wpdb->comments - WHERE comment_date_gmt > '2015-04-26' - AND CHAR_LENGTH( comment_content ) >= $content_length - AND ( comment_content LIKE '%<%' OR comment_content LIKE '%>%' )" + "SELECT `comment_ID` FROM `{$wpdb->comments}` + WHERE `comment_date_gmt` > '2015-04-26' + AND LENGTH( `comment_content` ) >= {$allowed_length} + AND ( `comment_content` LIKE '%<%' OR `comment_content` LIKE '%>%' )" ); foreach ( $comments as $comment ) { diff --git a/src/wp-includes/comment.php b/src/wp-includes/comment.php index 75c77b1d91..e2ed55386a 100644 --- a/src/wp-includes/comment.php +++ b/src/wp-includes/comment.php @@ -2118,17 +2118,7 @@ function wp_insert_comment( $commentdata ) { $compacted = compact( 'comment_post_ID', 'comment_author', 'comment_author_email', 'comment_author_url', 'comment_author_IP', 'comment_date', 'comment_date_gmt', 'comment_content', 'comment_karma', 'comment_approved', 'comment_agent', 'comment_type', 'comment_parent', 'user_id' ); if ( ! $wpdb->insert( $wpdb->comments, $compacted ) ) { - $fields = array( 'comment_author', 'comment_author_email', 'comment_author_url', 'comment_content' ); - - foreach( $fields as $field ) { - if ( isset( $compacted[ $field ] ) ) { - $compacted[ $field ] = $wpdb->strip_invalid_text_for_column( $wpdb->comments, $field, $compacted[ $field ] ); - } - } - - if ( ! $wpdb->insert( $wpdb->comments, $compacted ) ) { - return false; - } + return false; } $id = (int) $wpdb->insert_id; @@ -2252,6 +2242,8 @@ function wp_throttle_comment_flood($block, $time_lastcomment, $time_newcomment) * @return int|bool The ID of the comment on success, false on failure. */ function wp_new_comment( $commentdata ) { + global $wpdb; + if ( isset( $commentdata['user_ID'] ) ) { $commentdata['user_id'] = $commentdata['user_ID'] = (int) $commentdata['user_ID']; } @@ -2295,7 +2287,22 @@ function wp_new_comment( $commentdata ) { $comment_ID = wp_insert_comment($commentdata); if ( ! $comment_ID ) { - return false; + $fields = array( 'comment_author', 'comment_author_email', 'comment_author_url', 'comment_content' ); + + foreach( $fields as $field ) { + if ( isset( $commentdata[ $field ] ) ) { + $commentdata[ $field ] = $wpdb->strip_invalid_text_for_column( $wpdb->comments, $field, $commentdata[ $field ] ); + } + } + + $commentdata = wp_filter_comment( $commentdata ); + + $commentdata['comment_approved'] = wp_allow_comment( $commentdata ); + + $comment_ID = wp_insert_comment( $commentdata ); + if ( ! $comment_ID ) { + return false; + } } /** diff --git a/src/wp-includes/compat.php b/src/wp-includes/compat.php index 43667053f1..4317eb689b 100644 --- a/src/wp-includes/compat.php +++ b/src/wp-includes/compat.php @@ -13,23 +13,85 @@ if ( !function_exists('_') ) { } } +/** + * Returns whether PCRE/u (PCRE_UTF8 modifier) is available for use. + * + * @ignore + * @since 4.2.2 + * @access private + * + * @param bool $set - Used for testing only + * null : default - get PCRE/u capability + * false : Used for testing - return false for future calls to this function + * 'reset': Used for testing - restore default behavior of this function + */ +function _wp_can_use_pcre_u( $set = null ) { + static $utf8_pcre = 'reset'; + + if ( null !== $set ) { + $utf8_pcre = $set; + } + + if ( 'reset' === $utf8_pcre ) { + $utf8_pcre = @preg_match( '/^./u', 'a' ); + } + + return $utf8_pcre; +} + if ( ! function_exists( 'mb_substr' ) ) : function mb_substr( $str, $start, $length = null, $encoding = null ) { return _mb_substr( $str, $start, $length, $encoding ); } endif; +/* + * Only understands UTF-8 and 8bit. All other character sets will be treated as 8bit. + * For $encoding === UTF-8, the $str input is expected to be a valid UTF-8 byte sequence. + * The behavior of this function for invalid inputs is undefined. + */ function _mb_substr( $str, $start, $length = null, $encoding = null ) { + if ( null === $encoding ) { + $encoding = get_option( 'blog_charset' ); + } + // The solution below works only for UTF-8, // so in case of a different charset just use built-in substr() - $charset = get_option( 'blog_charset' ); - if ( ! in_array( $charset, array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ) ) ) { + if ( ! in_array( $encoding, array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ) ) ) { return is_null( $length ) ? substr( $str, $start ) : substr( $str, $start, $length ); } - // Use the regex unicode support to separate the UTF-8 characters into an array - preg_match_all( '/./us', $str, $match ); - $chars = is_null( $length ) ? array_slice( $match[0], $start ) : array_slice( $match[0], $start, $length ); - return implode( '', $chars ); + + if ( _wp_can_use_pcre_u() ) { + // Use the regex unicode support to separate the UTF-8 characters into an array + preg_match_all( '/./us', $str, $match ); + $chars = is_null( $length ) ? array_slice( $match[0], $start ) : array_slice( $match[0], $start, $length ); + return implode( '', $chars ); + } + + $regex = '/( + [\x00-\x7F] # single-byte sequences 0xxxxxxx + | [\xC2-\xDF][\x80-\xBF] # double-byte sequences 110xxxxx 10xxxxxx + | \xE0[\xA0-\xBF][\x80-\xBF] # triple-byte sequences 1110xxxx 10xxxxxx * 2 + | [\xE1-\xEC][\x80-\xBF]{2} + | \xED[\x80-\x9F][\x80-\xBF] + | [\xEE-\xEF][\x80-\xBF]{2} + | \xF0[\x90-\xBF][\x80-\xBF]{2} # four-byte sequences 11110xxx 10xxxxxx * 3 + | [\xF1-\xF3][\x80-\xBF]{3} + | \xF4[\x80-\x8F][\x80-\xBF]{2} + )/x'; + + $chars = array( '' ); // Start with 1 element instead of 0 since the first thing we do is pop + do { + // We had some string left over from the last round, but we counted it in that last round. + array_pop( $chars ); + + // Split by UTF-8 character, limit to 1000 characters (last array element will contain the rest of the string) + $pieces = preg_split( $regex, $str, 1000, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY ); + + $chars = array_merge( $chars, $pieces ); + } while ( count( $pieces ) > 1 && $str = array_pop( $pieces ) ); // If there's anything left over, repeat the loop. + + return join( '', array_slice( $chars, $start, $length ) ); } if ( ! function_exists( 'mb_strlen' ) ) : @@ -38,16 +100,54 @@ if ( ! function_exists( 'mb_strlen' ) ) : } endif; +/* + * Only understands UTF-8 and 8bit. All other character sets will be treated as 8bit. + * For $encoding === UTF-8, the $str input is expected to be a valid UTF-8 byte sequence. + * The behavior of this function for invalid inputs is undefined. + */ function _mb_strlen( $str, $encoding = null ) { + if ( null === $encoding ) { + $encoding = get_option( 'blog_charset' ); + } + // The solution below works only for UTF-8, // so in case of a different charset just use built-in strlen() - $charset = get_option( 'blog_charset' ); - if ( ! in_array( $charset, array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ) ) ) { + if ( ! in_array( $encoding, array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ) ) ) { return strlen( $str ); } - // Use the regex unicode support to separate the UTF-8 characters into an array - preg_match_all( '/./us', $str, $match ); - return count( $match[0] ); + + if ( _wp_can_use_pcre_u() ) { + // Use the regex unicode support to separate the UTF-8 characters into an array + preg_match_all( '/./us', $str, $match ); + return count( $match[0] ); + } + + $regex = '/(?: + [\x00-\x7F] # single-byte sequences 0xxxxxxx + | [\xC2-\xDF][\x80-\xBF] # double-byte sequences 110xxxxx 10xxxxxx + | \xE0[\xA0-\xBF][\x80-\xBF] # triple-byte sequences 1110xxxx 10xxxxxx * 2 + | [\xE1-\xEC][\x80-\xBF]{2} + | \xED[\x80-\x9F][\x80-\xBF] + | [\xEE-\xEF][\x80-\xBF]{2} + | \xF0[\x90-\xBF][\x80-\xBF]{2} # four-byte sequences 11110xxx 10xxxxxx * 3 + | [\xF1-\xF3][\x80-\xBF]{3} + | \xF4[\x80-\x8F][\x80-\xBF]{2} + )/x'; + + $count = 1; // Start at 1 instead of 0 since the first thing we do is decrement + do { + // We had some string left over from the last round, but we counted it in that last round. + $count--; + + // Split by UTF-8 character, limit to 1000 characters (last array element will contain the rest of the string) + $pieces = preg_split( $regex, $str, 1000 ); + + // Increment + $count += count( $pieces ); + } while ( $str = array_pop( $pieces ) ); // If there's anything left over, repeat the loop. + + // Fencepost: preg_split() always returns one extra item in the array + return --$count; } if ( !function_exists('hash_hmac') ): diff --git a/src/wp-includes/version.php b/src/wp-includes/version.php index ff960d7b73..4d0af000bf 100644 --- a/src/wp-includes/version.php +++ b/src/wp-includes/version.php @@ -11,7 +11,7 @@ $wp_version = '4.3-alpha-32280-src'; * * @global int $wp_db_version */ -$wp_db_version = 32308; +$wp_db_version = 32364; /** * Holds the TinyMCE version diff --git a/src/wp-includes/wp-db.php b/src/wp-includes/wp-db.php index 0bd7a1882b..4be05e6ec6 100644 --- a/src/wp-includes/wp-db.php +++ b/src/wp-includes/wp-db.php @@ -1809,6 +1809,8 @@ class wpdb { * @return int|false The number of rows affected, or false on error. */ function _insert_replace_helper( $table, $data, $format = null, $type = 'INSERT' ) { + $this->insert_id = 0; + if ( ! in_array( strtoupper( $type ), array( 'REPLACE', 'INSERT' ) ) ) { return false; } @@ -1829,7 +1831,6 @@ class wpdb { $sql = "$type INTO `$table` ($fields) VALUES ($formats)"; - $this->insert_id = 0; $this->check_current_query = false; return $this->query( $this->prepare( $sql, $values ) ); } @@ -2021,17 +2022,11 @@ class wpdb { // We can skip this field if we know it isn't a string. // This checks %d/%f versus ! %s because it's sprintf() could take more. $value['charset'] = false; - } elseif ( $this->check_ascii( $value['value'] ) ) { - // If it's ASCII, then we don't need the charset. We can skip this field. - $value['charset'] = false; } else { $value['charset'] = $this->get_col_charset( $table, $field ); if ( is_wp_error( $value['charset'] ) ) { return false; } - - // This isn't ASCII. Don't have strip_invalid_text() re-check. - $value['ascii'] = false; } $data[ $field ] = $value; @@ -2064,10 +2059,6 @@ class wpdb { } } - if ( false !== $value['length'] && mb_strlen( $value['value'] ) > $value['length'] ) { - return false; - } - $data[ $field ] = $value; } @@ -2406,14 +2397,16 @@ class wpdb { /** * Retrieve the maximum string length allowed in a given column. + * The length may either be specified as a byte length or a character length. * * @since 4.2.1 * @access public * * @param string $table Table name. * @param string $column Column name. - * @return mixed Max column length as an int. False if the column has no - * length. WP_Error object if there was an error. + * @return mixed array( 'length' => (int), 'type' => 'byte' | 'char' ) + * false if the column has no length (for example, numeric column) + * WP_Error object if there was an error. */ public function get_col_length( $table, $column ) { $tablekey = strtolower( $table ); @@ -2446,27 +2439,47 @@ class wpdb { } switch( $type ) { - case 'binary': case 'char': - case 'varbinary': case 'varchar': - return $length; + return array( + 'type' => 'char', + 'length' => (int) $length, + ); + break; + case 'binary': + case 'varbinary': + return array( + 'type' => 'byte', + 'length' => (int) $length, + ); break; case 'tinyblob': case 'tinytext': - return 255; // 2^8 - 1 + return array( + 'type' => 'byte', + 'length' => 255, // 2^8 - 1 + ); break; case 'blob': case 'text': - return 65535; // 2^16 - 1 + return array( + 'type' => 'byte', + 'length' => 65535, // 2^16 - 1 + ); break; case 'mediumblob': case 'mediumtext': - return 16777215; // 2^24 - 1 + return array( + 'type' => 'byte', + 'length' => 16777215, // 2^24 - 1 + ); break; case 'longblob': case 'longtext': - return 4294967295; // 2^32 - 1 + return array( + 'type' => 'byte', + 'length' => 4294967295, // 2^32 - 1 + ); break; default: return false; @@ -2572,50 +2585,55 @@ class wpdb { * remove invalid characters, a WP_Error object is returned. */ protected function strip_invalid_text( $data ) { - // Some multibyte character sets that we can check in PHP. - $mb_charsets = array( - 'ascii' => 'ASCII', - 'big5' => 'BIG-5', - 'eucjpms' => 'eucJP-win', - 'gb2312' => 'EUC-CN', - 'ujis' => 'EUC-JP', - 'utf32' => 'UTF-32', - ); - - $supported_charsets = array(); - if ( function_exists( 'mb_list_encodings' ) ) { - $supported_charsets = mb_list_encodings(); - } - $db_check_string = false; foreach ( $data as &$value ) { $charset = $value['charset']; - // Column isn't a string, or is latin1, which will will happily store anything. - if ( false === $charset || 'latin1' === $charset ) { + if ( is_array( $value['length'] ) ) { + $length = $value['length']['length']; + } else { + $length = false; + } + + // There's no charset to work with. + if ( false === $charset ) { continue; } + // Column isn't a string. if ( ! is_string( $value['value'] ) ) { continue; } - // ASCII is always OK. - if ( ! isset( $value['ascii'] ) && $this->check_ascii( $value['value'] ) ) { - continue; + $truncate_by_byte_length = 'byte' === $value['length']['type']; + + $needs_validation = true; + if ( + // latin1 can store any byte sequence + 'latin1' === $charset + || + // ASCII is always OK. + ( ! isset( $value['ascii'] ) && $this->check_ascii( $value['value'] ) ) + ) { + $truncate_by_byte_length = true; + $needs_validation = false; } - // Convert the text locally. - if ( $supported_charsets ) { - if ( isset( $mb_charsets[ $charset ] ) && in_array( $mb_charsets[ $charset ], $supported_charsets ) ) { - $value['value'] = mb_convert_encoding( $value['value'], $mb_charsets[ $charset ], $mb_charsets[ $charset ] ); + if ( $truncate_by_byte_length ) { + mbstring_binary_safe_encoding(); + if ( false !== $length && strlen( $value['value'] ) > $length ) { + $value['value'] = substr( $value['value'], 0, $length ); + } + reset_mbstring_encoding(); + + if ( ! $needs_validation ) { continue; } } // utf8 can be handled by regex, which is a bunch faster than a DB lookup. - if ( 'utf8' === $charset || 'utf8mb3' === $charset || 'utf8mb4' === $charset ) { + if ( ( 'utf8' === $charset || 'utf8mb3' === $charset || 'utf8mb4' === $charset ) && function_exists( 'mb_strlen' ) ) { $regex = '/ ( (?: [\x00-\x7F] # single-byte sequences 0xxxxxxx @@ -2625,7 +2643,7 @@ class wpdb { | \xED[\x80-\x9F][\x80-\xBF] | [\xEE-\xEF][\x80-\xBF]{2}'; - if ( 'utf8mb4' === $charset) { + if ( 'utf8mb4' === $charset ) { $regex .= ' | \xF0[\x90-\xBF][\x80-\xBF]{2} # four-byte sequences 11110xxx 10xxxxxx * 3 | [\xF1-\xF3][\x80-\xBF]{3} @@ -2638,6 +2656,11 @@ class wpdb { | . # anything else /x'; $value['value'] = preg_replace( $regex, '$1', $value['value'] ); + + + if ( false !== $length && mb_strlen( $value['value'], 'UTF-8' ) > $length ) { + $value['value'] = mb_substr( $value['value'], 0, $length, 'UTF-8' ); + } continue; } @@ -2654,8 +2677,14 @@ class wpdb { $queries[ $value['charset'] ] = array(); } - // Split the CONVERT() calls by charset, so we can make sure the connection is right - $queries[ $value['charset'] ][ $col ] = $this->prepare( "CONVERT( %s USING {$value['charset']} )", $value['value'] ); + // We're going to need to truncate by characters or bytes, depending on the length value we have. + if ( 'byte' === $value['length']['type'] ) { + // Split the CONVERT() calls by charset, so we can make sure the connection is right + $queries[ $value['charset'] ][ $col ] = $this->prepare( "CONVERT( LEFT( CONVERT( %s USING binary ), %d ) USING {$value['charset']} )", $value['value'], $value['length']['length'] ); + } else { + $queries[ $value['charset'] ][ $col ] = $this->prepare( "LEFT( CONVERT( %s USING {$value['charset']} ), %d )", $value['value'], $value['length']['length'] ); + } + unset( $data[ $col ]['db'] ); } } @@ -2674,16 +2703,19 @@ class wpdb { $this->check_current_query = false; - $row = $this->get_row( "SELECT " . implode( ', ', $query ), ARRAY_N ); + $sql = array(); + foreach ( $query as $column => $column_query ) { + $sql[] = $column_query . " AS x_$column"; + } + + $row = $this->get_row( "SELECT " . implode( ', ', $sql ), ARRAY_A ); if ( ! $row ) { $this->set_charset( $this->dbh, $connection_charset ); return new WP_Error( 'wpdb_strip_invalid_text_failure' ); } - $cols = array_keys( $query ); - $col_count = count( $cols ); - for ( $ii = 0; $ii < $col_count; $ii++ ) { - $data[ $cols[ $ii ] ]['value'] = $row[ $ii ]; + foreach ( array_keys( $query ) as $column ) { + $data[ $column ]['value'] = $row["x_$column"]; } } @@ -2725,6 +2757,7 @@ class wpdb { 'value' => $query, 'charset' => $charset, 'ascii' => false, + 'length' => false, ); $data = $this->strip_invalid_text( array( $data ) ); @@ -2747,7 +2780,7 @@ class wpdb { * @return string|WP_Error The converted string, or a WP_Error object if the conversion fails. */ public function strip_invalid_text_for_column( $table, $column, $value ) { - if ( ! is_string( $value ) || $this->check_ascii( $value ) ) { + if ( ! is_string( $value ) ) { return $value; } @@ -2764,7 +2797,7 @@ class wpdb { $column => array( 'value' => $value, 'charset' => $charset, - 'ascii' => false, + 'length' => $this->get_col_length( $table, $column ), ) ); diff --git a/tests/phpunit/tests/comment.php b/tests/phpunit/tests/comment.php index dbb01856ab..854619572d 100644 --- a/tests/phpunit/tests/comment.php +++ b/tests/phpunit/tests/comment.php @@ -121,7 +121,8 @@ class Tests_Comment extends WP_UnitTestCase { $_SERVER['REMOTE_ADDR'] = ''; } - $post_id = $this->factory->post->create(); + $u = $this->factory->user->create(); + $post_id = $this->factory->post->create( array( 'post_author' => $u ) ); $data = array( 'comment_post_ID' => $post_id, @@ -136,7 +137,9 @@ class Tests_Comment extends WP_UnitTestCase { $id = wp_new_comment( $data ); - $this->assertFalse( $id ); + $comment = get_comment( $id ); + + $this->assertEquals( strlen( $comment->comment_content ), 65535 ); // Cleanup. if ( isset( $remote_addr ) ) { diff --git a/tests/phpunit/tests/compat.php b/tests/phpunit/tests/compat.php index 3409dbca0b..5aa749225d 100644 --- a/tests/phpunit/tests/compat.php +++ b/tests/phpunit/tests/compat.php @@ -2,13 +2,166 @@ /** * @group compat + * @group security-153 */ class Tests_Compat extends WP_UnitTestCase { - function test_mb_substr() { - $this->assertEquals('баб', _mb_substr('баба', 0, 3)); - $this->assertEquals('баб', _mb_substr('баба', 0, -1)); - $this->assertEquals('баб', _mb_substr('баба', 0, -1)); - $this->assertEquals('I am your б', _mb_substr('I am your баба', 0, 11)); + function utf8_string_lengths() { + return array( + // string, character_length, byte_length + array( 'баба', 4, 8 ), + array( 'баб', 3, 6 ), + array( 'I am your б', 11, 12 ), + array( '1111111111', 10, 10 ), + array( '²²²²²²²²²²', 10, 20 ), + array( '3333333333', 10, 30 ), + array( '𝟜𝟜𝟜𝟜𝟜𝟜𝟜𝟜𝟜𝟜', 10, 40 ), + array( '1²3𝟜1²3𝟜1²3𝟜', 12, 30 ), + ); + } + + function utf8_substrings() { + return array( + // string, start, length, character_substring, byte_substring + array( 'баба', 0, 3, 'баб', "б\xD0" ), + array( 'баба', 0, -1, 'баб', "баб\xD0" ), + array( 'баба', 1, null, 'аба', "\xB1аба" ), + array( 'баба', -3, null, 'аба', "\xB1а" ), + array( 'баба', -3, 2, 'аб', "\xB1\xD0" ), + array( 'баба', -1, 2, 'а', "\xB0" ), + array( 'I am your баба', 0, 11, 'I am your б', "I am your \xD0" ), + ); + } + + /** + * @dataProvider utf8_string_lengths + */ + function test_mb_strlen( $string, $expected_character_length ) { + $this->assertEquals( $expected_character_length, _mb_strlen( $string, 'UTF-8' ) ); + } + + /** + * @dataProvider utf8_string_lengths + */ + function test_mb_strlen_via_regex( $string, $expected_character_length ) { + _wp_can_use_pcre_u( false ); + $this->assertEquals( $expected_character_length, _mb_strlen( $string, 'UTF-8' ) ); + _wp_can_use_pcre_u( 'reset' ); + } + + /** + * @dataProvider utf8_string_lengths + */ + function test_8bit_mb_strlen( $string, $expected_character_length, $expected_byte_length ) { + $this->assertEquals( $expected_byte_length, _mb_strlen( $string, '8bit' ) ); + } + + /** + * @dataProvider utf8_substrings + */ + function test_mb_substr( $string, $start, $length, $expected_character_substring ) { + $this->assertEquals( $expected_character_substring, _mb_substr( $string, $start, $length, 'UTF-8' ) ); + } + + /** + * @dataProvider utf8_substrings + */ + function test_mb_substr_via_regex( $string, $start, $length, $expected_character_substring ) { + _wp_can_use_pcre_u( false ); + $this->assertEquals( $expected_character_substring, _mb_substr( $string, $start, $length, 'UTF-8' ) ); + _wp_can_use_pcre_u( 'reset' ); + } + + /** + * @dataProvider utf8_substrings + */ + function test_8bit_mb_substr( $string, $start, $length, $expected_character_substring, $expected_byte_substring ) { + $this->assertEquals( $expected_byte_substring, _mb_substr( $string, $start, $length, '8bit' ) ); + } + + function test_mb_substr_phpcore(){ + /* https://github.com/php/php-src/blob/php-5.6.8/ext/mbstring/tests/mb_substr_basic.phpt */ + $string_ascii = 'ABCDEF'; + $string_mb = base64_decode('5pel5pys6Kqe44OG44Kt44K544OI44Gn44GZ44CCMDEyMzTvvJXvvJbvvJfvvJjvvJnjgII='); + + $this->assertEquals( 'DEF', _mb_substr($string_ascii, 3) ); + $this->assertEquals( 'DEF', _mb_substr($string_ascii, 3, 5, 'ISO-8859-1') ); + + // specific latin-1 as that is the default the core php test opporates under + $this->assertEquals( 'peacrOiqng==' , base64_encode( _mb_substr($string_mb, 2, 7, 'latin-1' ) ) ); + $this->assertEquals( '6Kqe44OG44Kt44K544OI44Gn44GZ', base64_encode( _mb_substr($string_mb, 2, 7, 'utf-8') ) ); + + /* https://github.com/php/php-src/blob/php-5.6.8/ext/mbstring/tests/mb_substr_variation1.phpt */ + $start = 0; + $length = 5; + $unset_var = 10; + unset ($unset_var); + $heredoc = <<assertEquals( $outputs[$iterator] , _mb_substr($input, $start, $length) ); + $iterator++; + } + } function test_hash_hmac_simple() { @@ -34,3 +187,10 @@ class Tests_Compat extends WP_UnitTestCase { $this->assertEquals( array( 'foo' ), $json->decode( '["foo"]' ) ); } } + +/* used in test_mb_substr_phpcore */ +class classA { + public function __toString() { + return "Class A object"; + } +} diff --git a/tests/phpunit/tests/db.php b/tests/phpunit/tests/db.php index 1df898756c..ce0a4a519d 100644 --- a/tests/phpunit/tests/db.php +++ b/tests/phpunit/tests/db.php @@ -746,7 +746,6 @@ class Tests_DB extends WP_UnitTestCase { 'value' => '¡foo foo foo!', 'format' => '%s', 'charset' => $expected_charset, - 'ascii' => false, 'length' => $wpdb->get_col_length( $wpdb->posts, 'post_content' ), ) ); diff --git a/tests/phpunit/tests/db/charset.php b/tests/phpunit/tests/db/charset.php index 4ae81d8ba8..cb06b2e885 100755 --- a/tests/phpunit/tests/db/charset.php +++ b/tests/phpunit/tests/db/charset.php @@ -6,6 +6,7 @@ require_once dirname( dirname( __FILE__ ) ) . '/db.php'; * Test WPDB methods * * @group wpdb + * @group security-153 */ class Tests_DB_Charset extends WP_UnitTestCase { @@ -28,57 +29,227 @@ class Tests_DB_Charset extends WP_UnitTestCase { // latin1. latin1 never changes. 'charset' => 'latin1', 'value' => "\xf0\x9f\x8e\xb7", - 'expected' => "\xf0\x9f\x8e\xb7" + 'expected' => "\xf0\x9f\x8e\xb7", + 'length' => array( 'type' => 'char', 'length' => 100 ), + ), + 'latin1_char_length' => array( + // latin1. latin1 never changes. + 'charset' => 'latin1', + 'value' => str_repeat( 'A', 11 ), + 'expected' => str_repeat( 'A', 10 ), + 'length' => array( 'type' => 'char', 'length' => 10 ), + ), + 'latin1_byte_length' => array( + // latin1. latin1 never changes. + 'charset' => 'latin1', + 'value' => str_repeat( 'A', 11 ), + 'expected' => str_repeat( 'A', 10 ), + 'length' => array( 'type' => 'byte', 'length' => 10 ), ), 'ascii' => array( // ascii gets special treatment, make sure it's covered 'charset' => 'ascii', 'value' => 'Hello World', - 'expected' => 'Hello World' + 'expected' => 'Hello World', + 'length' => array( 'type' => 'char', 'length' => 100 ), + ), + 'ascii_char_length' => array( + // ascii gets special treatment, make sure it's covered + 'charset' => 'ascii', + 'value' => str_repeat( 'A', 11 ), + 'expected' => str_repeat( 'A', 10 ), + 'length' => array( 'type' => 'char', 'length' => 10 ), + ), + 'ascii_byte_length' => array( + // ascii gets special treatment, make sure it's covered + 'charset' => 'ascii', + 'value' => str_repeat( 'A', 11 ), + 'expected' => str_repeat( 'A', 10 ), + 'length' => array( 'type' => 'byte', 'length' => 10 ), ), 'utf8' => array( // utf8 only allows <= 3-byte chars 'charset' => 'utf8', 'value' => "H€llo\xf0\x9f\x98\x88World¢", - 'expected' => 'H€lloWorld¢' + 'expected' => 'H€lloWorld¢', + 'length' => array( 'type' => 'char', 'length' => 100 ), + ), + 'utf8_23char_length' => array( + // utf8 only allows <= 3-byte chars + 'charset' => 'utf8', + 'value' => str_repeat( "²3", 10 ), + 'expected' => str_repeat( "²3", 5 ), + 'length' => array( 'type' => 'char', 'length' => 10 ), + ), + 'utf8_23byte_length' => array( + // utf8 only allows <= 3-byte chars + 'charset' => 'utf8', + 'value' => str_repeat( "²3", 10 ), + 'expected' => "²3²3", + 'length' => array( 'type' => 'byte', 'length' => 10 ), + ), + 'utf8_3char_length' => array( + // utf8 only allows <= 3-byte chars + 'charset' => 'utf8', + 'value' => str_repeat( "3", 11 ), + 'expected' => str_repeat( "3", 10 ), + 'length' => array( 'type' => 'char', 'length' => 10 ), + ), + 'utf8_3byte_length' => array( + // utf8 only allows <= 3-byte chars + 'charset' => 'utf8', + 'value' => str_repeat( "3", 11 ), + 'expected' => "333", + 'length' => array( 'type' => 'byte', 'length' => 10 ), ), 'utf8mb3' => array( // utf8mb3 should behave the same an utf8 'charset' => 'utf8mb3', 'value' => "H€llo\xf0\x9f\x98\x88World¢", - 'expected' => 'H€lloWorld¢' + 'expected' => 'H€lloWorld¢', + 'length' => array( 'type' => 'char', 'length' => 100 ), + ), + 'utf8mb3_23char_length' => array( + // utf8mb3 should behave the same an utf8 + 'charset' => 'utf8mb3', + 'value' => str_repeat( "²3", 10 ), + 'expected' => str_repeat( "²3", 5 ), + 'length' => array( 'type' => 'char', 'length' => 10 ), + ), + 'utf8mb3_23byte_length' => array( + // utf8mb3 should behave the same an utf8 + 'charset' => 'utf8mb3', + 'value' => str_repeat( "²3", 10 ), + 'expected' => "²3²3", + 'length' => array( 'type' => 'byte', 'length' => 10 ), + ), + 'utf8mb3_3char_length' => array( + // utf8mb3 should behave the same an utf8 + 'charset' => 'utf8mb3', + 'value' => str_repeat( "3", 11 ), + 'expected' => str_repeat( "3", 10 ), + 'length' => array( 'type' => 'char', 'length' => 10 ), + ), + 'utf8mb3_3byte_length' => array( + // utf8mb3 should behave the same an utf8 + 'charset' => 'utf8mb3', + 'value' => str_repeat( "3", 10 ), + 'expected' => "333", + 'length' => array( 'type' => 'byte', 'length' => 10 ), ), 'utf8mb4' => array( // utf8mb4 allows 4-byte characters, too 'charset' => 'utf8mb4', 'value' => "H€llo\xf0\x9f\x98\x88World¢", - 'expected' => "H€llo\xf0\x9f\x98\x88World¢" + 'expected' => "H€llo\xf0\x9f\x98\x88World¢", + 'length' => array( 'type' => 'char', 'length' => 100 ), + ), + 'utf8mb4_234char_length' => array( + // utf8mb4 allows 4-byte characters, too + 'charset' => 'utf8mb4', + 'value' => str_repeat( "²3𝟜", 10 ), + 'expected' => "²3𝟜²3𝟜²3𝟜²", + 'length' => array( 'type' => 'char', 'length' => 10 ), + ), + 'utf8mb4_234byte_length' => array( + // utf8mb4 allows 4-byte characters, too + 'charset' => 'utf8mb4', + 'value' => str_repeat( "²3𝟜", 10 ), + 'expected' => "²3𝟜", + 'length' => array( 'type' => 'byte', 'length' => 10 ), + ), + 'utf8mb4_4char_length' => array( + // utf8mb4 allows 4-byte characters, too + 'charset' => 'utf8mb4', + 'value' => str_repeat( "𝟜", 11 ), + 'expected' => str_repeat( "𝟜", 10 ), + 'length' => array( 'type' => 'char', 'length' => 10 ), + ), + 'utf8mb4_4byte_length' => array( + // utf8mb4 allows 4-byte characters, too + 'charset' => 'utf8mb4', + 'value' => str_repeat( "𝟜", 10 ), + 'expected' => "𝟜𝟜", + 'length' => array( 'type' => 'byte', 'length' => 10 ), ), 'koi8r' => array( 'charset' => 'koi8r', 'value' => "\xfdord\xf2ress", 'expected' => "\xfdord\xf2ress", + 'length' => array( 'type' => 'char', 'length' => 100 ), + ), + 'koi8r_char_length' => array( + 'charset' => 'koi8r', + 'value' => str_repeat( "\xfd\xf2", 10 ), + 'expected' => str_repeat( "\xfd\xf2", 5 ), + 'length' => array( 'type' => 'char', 'length' => 10 ), + ), + 'koi8r_byte_length' => array( + 'charset' => 'koi8r', + 'value' => str_repeat( "\xfd\xf2", 10 ), + 'expected' => str_repeat( "\xfd\xf2", 5 ), + 'length' => array( 'type' => 'byte', 'length' => 10 ), ), 'hebrew' => array( 'charset' => 'hebrew', 'value' => "\xf9ord\xf7ress", 'expected' => "\xf9ord\xf7ress", + 'length' => array( 'type' => 'char', 'length' => 100 ), + ), + 'hebrew_char_length' => array( + 'charset' => 'hebrew', + 'value' => str_repeat( "\xf9\xf7", 10 ), + 'expected' => str_repeat( "\xf9\xf7", 5 ), + 'length' => array( 'type' => 'char', 'length' => 10 ), + ), + 'hebrew_byte_length' => array( + 'charset' => 'hebrew', + 'value' => str_repeat( "\xf9\xf7", 10 ), + 'expected' => str_repeat( "\xf9\xf7", 5 ), + 'length' => array( 'type' => 'byte', 'length' => 10 ), ), 'cp1251' => array( 'charset' => 'cp1251', 'value' => "\xd8ord\xd0ress", 'expected' => "\xd8ord\xd0ress", + 'length' => array( 'type' => 'char', 'length' => 100 ), + ), + 'cp1251_char_length' => array( + 'charset' => 'cp1251', + 'value' => str_repeat( "\xd8\xd0", 10 ), + 'expected' => str_repeat( "\xd8\xd0", 5 ), + 'length' => array( 'type' => 'char', 'length' => 10 ), + ), + 'cp1251_byte_length' => array( + 'charset' => 'cp1251', + 'value' => str_repeat( "\xd8\xd0", 10 ), + 'expected' => str_repeat( "\xd8\xd0", 5 ), + 'length' => array( 'type' => 'byte', 'length' => 10 ), ), 'tis620' => array( 'charset' => 'tis620', 'value' => "\xccord\xe3ress", 'expected' => "\xccord\xe3ress", + 'length' => array( 'type' => 'char', 'length' => 100 ), + ), + 'tis620_char_length' => array( + 'charset' => 'tis620', + 'value' => str_repeat( "\xcc\xe3", 10 ), + 'expected' => str_repeat( "\xcc\xe3", 5 ), + 'length' => array( 'type' => 'char', 'length' => 10 ), + ), + 'tis620_byte_length' => array( + 'charset' => 'tis620', + 'value' => str_repeat( "\xcc\xe3", 10 ), + 'expected' => str_repeat( "\xcc\xe3", 5 ), + 'length' => array( 'type' => 'byte', 'length' => 10 ), ), 'false' => array( // false is a column with no character set (ie, a number column) 'charset' => false, 'value' => 100, - 'expected' => 100 + 'expected' => 100, + 'length' => false, ), ); @@ -94,7 +265,22 @@ class Tests_DB_Charset extends WP_UnitTestCase { $fields['big5'] = array( 'charset' => 'big5', 'value' => $big5, - 'expected' => $big5 + 'expected' => $big5, + 'length' => array( 'type' => 'char', 'length' => 100 ), + ); + + $fields['big5_char_length'] = array( + 'charset' => 'big5', + 'value' => str_repeat( $big5, 10 ), + 'expected' => str_repeat( $big5, 3 ) . 'a', + 'length' => array( 'type' => 'char', 'length' => 10 ), + ); + + $fields['big5_byte_length'] = array( + 'charset' => 'big5', + 'value' => str_repeat( $big5, 10 ), + 'expected' => str_repeat( $big5, 2 ) . 'a', + 'length' => array( 'type' => 'byte', 'length' => 10 ), ); } @@ -170,14 +356,14 @@ class Tests_DB_Charset extends WP_UnitTestCase { ); $all_ascii_fields = array( - 'post_content' => array( 'value' => 'foo foo foo!', 'format' => '%s', 'charset' => false ), - 'post_excerpt' => array( 'value' => 'bar bar bar!', 'format' => '%s', 'charset' => false ), + 'post_content' => array( 'value' => 'foo foo foo!', 'format' => '%s', 'charset' => $charset ), + 'post_excerpt' => array( 'value' => 'bar bar bar!', 'format' => '%s', 'charset' => $charset ), ); // This is the same data used in process_field_charsets_for_nonexistent_table() $non_ascii_string_fields = array( - 'post_content' => array( 'value' => '¡foo foo foo!', 'format' => '%s', 'charset' => $charset, 'ascii' => false ), - 'post_excerpt' => array( 'value' => '¡bar bar bar!', 'format' => '%s', 'charset' => $charset, 'ascii' => false ), + 'post_content' => array( 'value' => '¡foo foo foo!', 'format' => '%s', 'charset' => $charset ), + 'post_excerpt' => array( 'value' => '¡bar bar bar!', 'format' => '%s', 'charset' => $charset ), ); $vars = get_defined_vars(); @@ -544,4 +730,16 @@ class Tests_DB_Charset extends WP_UnitTestCase { self::$_wpdb->query( $drop ); } + + function test_strip_invalid_test_for_column_bails_if_ascii_input_too_long() { + global $wpdb; + + // TEXT column + $stripped = $wpdb->strip_invalid_text_for_column( $wpdb->comments, 'comment_content', str_repeat( 'A', 65536 ) ); + $this->assertEquals( 65535, strlen( $stripped ) ); + + // VARCHAR column + $stripped = $wpdb->strip_invalid_text_for_column( $wpdb->comments, 'comment_agent', str_repeat( 'A', 256 ) ); + $this->assertEquals( 255, strlen( $stripped ) ); + } }