From fb28a78331016bb5850b63a4935b6285176cc21a Mon Sep 17 00:00:00 2001 From: Andrew Nacin Date: Thu, 20 Nov 2014 14:27:08 +0000 Subject: [PATCH] Anchor texturize to shortcodes to improve regex efficiency. props miqrogroove. see #29557 for segfault issues. git-svn-id: https://develop.svn.wordpress.org/trunk@30449 602fd350-edb4-49c9-b593-d223f7449a82 --- src/wp-includes/formatting.php | 22 +++++++++++++------ .../phpunit/tests/formatting/WPTexturize.php | 16 +++++++------- 2 files changed, 23 insertions(+), 15 deletions(-) diff --git a/src/wp-includes/formatting.php b/src/wp-includes/formatting.php index 57cfcb1eac..5209124c4d 100644 --- a/src/wp-includes/formatting.php +++ b/src/wp-includes/formatting.php @@ -28,7 +28,7 @@ * @return string The string replaced with html entities */ function wptexturize($text, $reset = false) { - global $wp_cockneyreplace; + global $wp_cockneyreplace, $shortcode_tags; static $static_characters, $static_replacements, $dynamic_characters, $dynamic_replacements, $default_no_texturize_tags, $default_no_texturize_shortcodes, $run_texturize = true; @@ -205,6 +205,10 @@ function wptexturize($text, $reset = false) { // Look for shortcodes and HTML elements. + $tagnames = array_keys( $shortcode_tags ); + $tagregexp = join( '|', array_map( 'preg_quote', $tagnames ) ); + $tagregexp = "(?:$tagregexp)(?![\\w-])"; // Excerpt of get_shortcode_regex(). + $comment_regex = '!' // Start of comment, after the <. . '(?:' // Unroll the loop: Consume everything until --> is found. @@ -214,12 +218,16 @@ function wptexturize($text, $reset = false) { . '(?:-->)?'; // End of comment. If not found, match all input. $shortcode_regex = - '\[' // Find start of shortcode. - . '[\/\[]?' // Shortcodes may begin with [/ or [[ - . '[^\s\/\[\]]' // No whitespace before name. - . '[^\[\]]*+' // Shortcodes do not contain other shortcodes. Possessive critical. - . '\]' // Find end of shortcode. - . '\]?'; // Shortcodes may end with ]] + '\[' // Find start of shortcode. + . '[\/\[]?' // Shortcodes may begin with [/ or [[ + . $tagregexp // Only match registered shortcodes, because performance. + . '(?:' + . '[^\[\]<>]+' // Shortcodes do not contain other shortcodes. Quantifier critical. + . '|' + . '<[^\[\]>]*>' // HTML elements permitted. Prevents matching ] before >. + . ')*+' // Possessive critical. + . '\]' // Find end of shortcode. + . '\]?'; // Shortcodes may end with ]] $regex = '/(' // Capture the entire match. diff --git a/tests/phpunit/tests/formatting/WPTexturize.php b/tests/phpunit/tests/formatting/WPTexturize.php index 1bc197bfef..1187f1eb52 100644 --- a/tests/phpunit/tests/formatting/WPTexturize.php +++ b/tests/phpunit/tests/formatting/WPTexturize.php @@ -1196,12 +1196,12 @@ class Tests_Formatting_WPTexturize extends WP_UnitTestCase { '[ is it wise to maybe ]', ), array( - '[is it wise to maybe ]', // HTML corruption is a known bug. See tickets #12690 and #29557. - '[is it wise to maybe ]', + '[is it wise to maybe ]', ), array( '[caption - is it wise to maybe ]', - '[caption - is it wise to maybe ]', ), array( '[ photos by this guy ]', @@ -1767,23 +1767,23 @@ class Tests_Formatting_WPTexturize extends WP_UnitTestCase { ), array( '[code ...]...[/code]', // code is not a registered shortcode. - '[code ...]...[/code]', + '[code …]…[/code]', ), array( '[hello ...]...[/hello]', // hello is not a registered shortcode. - '[hello ...]…[/hello]', + '[hello …]…[/hello]', ), array( '[...]...[/...]', // These are potentially usable shortcodes. - '[...]…[/...]', + '[…]…[/…]', ), array( '[gal>ery ...]', - '[gal>ery ...]', + '[gal>ery …]', ), array( '[randomthing param="test"]', - '[randomthing param="test"]', + '[randomthing param=”test”]', ), array( '[[audio]...[/audio]...', // These are potentially usable shortcodes. Unfortunately, the meaning of [[audio] is ambiguous unless we run the entire shortcode regexp.