From 23f7f53be2b3b7902e53f875016c278cd8cc2f84 Mon Sep 17 00:00:00 2001 From: Scott Taylor Date: Mon, 29 Sep 2014 04:06:54 +0000 Subject: [PATCH] The joys of `wptexturize()`: * Revert parts of [28773] and [28727] and [29748]. * Do not crash PHP. Make the shortcode quantifier possessive to avoid backtracks. * Reduce backtracking in long HTML comments by 100x. * Do not ignore unclosed HTML comments. * Do not break unregistered shortcodes, e.g. `[hello attr="value"]`. * Do not break HTML in shortcode attributes, e.g. `[hello attr="<"]`. * Do not match for shortcodes when there is extra whitespace, e.g. `[ hello ]`. * Add unit tests to show #12690 was not fully resolved. * Tested PHP 5.2.4, 5.2.13, 5.4.32, and 5.5.8. Adds/modifies unit tests. Props miqrogroove. See #29557. git-svn-id: https://develop.svn.wordpress.org/trunk@29781 602fd350-edb4-49c9-b593-d223f7449a82 --- src/wp-includes/formatting.php | 60 +++++++++------ src/wp-includes/shortcodes.php | 2 +- .../phpunit/tests/formatting/WPTexturize.php | 76 ++++++++++++++++--- 3 files changed, 100 insertions(+), 38 deletions(-) diff --git a/src/wp-includes/formatting.php b/src/wp-includes/formatting.php index 0f204b8931..b7b275222a 100644 --- a/src/wp-includes/formatting.php +++ b/src/wp-includes/formatting.php @@ -28,7 +28,7 @@ * @return string The string replaced with html entities */ function wptexturize($text, $reset = false) { - global $wp_cockneyreplace, $shortcode_tags; + global $wp_cockneyreplace; static $static_characters, $static_replacements, $dynamic_characters, $dynamic_replacements, $default_no_texturize_tags, $default_no_texturize_shortcodes, $run_texturize = true; @@ -205,45 +205,55 @@ function wptexturize($text, $reset = false) { // Look for shortcodes and HTML elements. - $tagnames = array_keys( $shortcode_tags ); - $tagregexp = join( '|', array_map( 'preg_quote', $tagnames ) ); - $tagregexp = "(?:$tagregexp)(?![\\w-])"; // Excerpt of get_shortcode_regex(). + $comment_regex = + '!' // Start of comment, after the <. + . '(?:' // Unroll the loop: Consume everything until --> is found. + . '-(?!->)' // Dash not followed by end of comment. + . '[^\-]*+' // Consume non-dashes. + . ')*+' // Loop possessively. + . '(?:-->)?'; // End of comment. If not found, match all input. - $regex = '/(' // Capture the entire match. - . '<' // Find start of element. - . '(?(?=!--)' // Is this a comment? - . '.+?--\s*>' // Find end of comment + $shortcode_regex = + '\[' // Find start of shortcode. + . '[\/\[]?' // Shortcodes may begin with [/ or [[ + . '[^\s\/\[\]]' // No whitespace before name. + . '[^\[\]]*+' // Shortcodes do not contain other shortcodes. Possessive critical. + . '\]' // Find end of shortcode. + . '\]?'; // Shortcodes may end with ]] + + $regex = + '/(' // Capture the entire match. + . '<' // Find start of element. + . '(?(?=!--)' // Is this a comment? + . $comment_regex // Find end of comment. + . '|' + . '[^>]+>' // Find end of element. + . ')' . '|' - . '[^>]+>' // Find end of element - . ')' - . '|' - . '\[' // Find start of shortcode. - . '\[?' // Shortcodes may begin with [[ - . '\/?' // Closing slash may precede name. - . $tagregexp // Only match registered shortcodes, because performance. - . '[^\[\]]*' // Shortcodes do not contain other shortcodes. - . '\]' // Find end of shortcode. - . '\]?' // Shortcodes may end with ]] - . ')/s'; + . $shortcode_regex // Find shortcodes. + . ')/s'; $textarr = preg_split( $regex, $text, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY ); foreach ( $textarr as &$curl ) { // Only call _wptexturize_pushpop_element if $curl is a delimiter. $first = $curl[0]; - if ( '<' === $first && '>' === substr( $curl, -1 ) ) { - // This is an HTML delimiter. + if ( '<' === $first && '', ), array( - 'b', // Browsers seem to allow this. + 'ab', + ), + array( + 'ab', + 'ab', + ), + array( + 'ab', + 'ab', + ), + array( + 'ab', + 'ab', + ), + array( + 'ab', + 'ab', + ), + array( + 'ab', + 'ab', + ), + array( + 'a -->b', + 'a –>b', ), array( '', @@ -1727,11 +1767,23 @@ class Tests_Formatting_WPTexturize extends WP_UnitTestCase { ), array( '[code ...]...[/code]', // code is not a registered shortcode. - '[code …]…[/code]', + '[code ...]...[/code]', ), array( '[hello ...]...[/hello]', // hello is not a registered shortcode. - '[hello …]…[/hello]', + '[hello ...]…[/hello]', + ), + array( + '[...]...[/...]', // These are potentially usable shortcodes. + '[...]…[/...]', + ), + array( + '[gal>ery ...]', + '[gal>ery ...]', + ), + array( + '[randomthing param="test"]', + '[randomthing param="test"]', ), array( '[[audio]...[/audio]...', // These are potentially usable shortcodes. Unfortunately, the meaning of [[audio] is ambiguous unless we run the entire shortcode regexp.