Anchor texturize to shortcodes to improve regex efficiency.

props miqrogroove.
see #29557 for segfault issues.


git-svn-id: https://develop.svn.wordpress.org/trunk@30449 602fd350-edb4-49c9-b593-d223f7449a82
This commit is contained in:
Andrew Nacin 2014-11-20 14:27:08 +00:00
parent 5db0ce11fb
commit fb28a78331
2 changed files with 23 additions and 15 deletions

View File

@ -28,7 +28,7 @@
* @return string The string replaced with html entities * @return string The string replaced with html entities
*/ */
function wptexturize($text, $reset = false) { function wptexturize($text, $reset = false) {
global $wp_cockneyreplace; global $wp_cockneyreplace, $shortcode_tags;
static $static_characters, $static_replacements, $dynamic_characters, $dynamic_replacements, static $static_characters, $static_replacements, $dynamic_characters, $dynamic_replacements,
$default_no_texturize_tags, $default_no_texturize_shortcodes, $run_texturize = true; $default_no_texturize_tags, $default_no_texturize_shortcodes, $run_texturize = true;
@ -205,6 +205,10 @@ function wptexturize($text, $reset = false) {
// Look for shortcodes and HTML elements. // Look for shortcodes and HTML elements.
$tagnames = array_keys( $shortcode_tags );
$tagregexp = join( '|', array_map( 'preg_quote', $tagnames ) );
$tagregexp = "(?:$tagregexp)(?![\\w-])"; // Excerpt of get_shortcode_regex().
$comment_regex = $comment_regex =
'!' // Start of comment, after the <. '!' // Start of comment, after the <.
. '(?:' // Unroll the loop: Consume everything until --> is found. . '(?:' // Unroll the loop: Consume everything until --> is found.
@ -214,12 +218,16 @@ function wptexturize($text, $reset = false) {
. '(?:-->)?'; // End of comment. If not found, match all input. . '(?:-->)?'; // End of comment. If not found, match all input.
$shortcode_regex = $shortcode_regex =
'\[' // Find start of shortcode. '\[' // Find start of shortcode.
. '[\/\[]?' // Shortcodes may begin with [/ or [[ . '[\/\[]?' // Shortcodes may begin with [/ or [[
. '[^\s\/\[\]]' // No whitespace before name. . $tagregexp // Only match registered shortcodes, because performance.
. '[^\[\]]*+' // Shortcodes do not contain other shortcodes. Possessive critical. . '(?:'
. '\]' // Find end of shortcode. . '[^\[\]<>]+' // Shortcodes do not contain other shortcodes. Quantifier critical.
. '\]?'; // Shortcodes may end with ]] . '|'
. '<[^\[\]>]*>' // HTML elements permitted. Prevents matching ] before >.
. ')*+' // Possessive critical.
. '\]' // Find end of shortcode.
. '\]?'; // Shortcodes may end with ]]
$regex = $regex =
'/(' // Capture the entire match. '/(' // Capture the entire match.

View File

@ -1196,12 +1196,12 @@ class Tests_Formatting_WPTexturize extends WP_UnitTestCase {
'[ is it wise to <a title="allow user content ] here? hmm"> maybe </a> ]', '[ is it wise to <a title="allow user content ] here? hmm"> maybe </a> ]',
), ),
array( array(
'[is it wise to <a title="allow user content ] here? hmm"> maybe </a> ]', // HTML corruption is a known bug. See tickets #12690 and #29557. '[is it wise to <a title="allow user content ] here? hmm"> maybe </a> ]',
'[is it wise to <a title="allow user content ] here? hmm&#8221;> maybe </a> ]', '[is it wise to <a title="allow user content ] here? hmm"> maybe </a> ]',
), ),
array( array(
'[caption - is it wise to <a title="allow user content ] here? hmm"> maybe </a> ]', '[caption - is it wise to <a title="allow user content ] here? hmm"> maybe </a> ]',
'[caption - is it wise to <a title="allow user content ] here? hmm&#8221;> maybe </a> ]', '[caption &#8211; is it wise to <a title="allow user content ] here? hmm"> maybe </a> ]',
), ),
array( array(
'[ photos by <a href="http://example.com/?a[]=1&a[]=2"> this guy </a> ]', '[ photos by <a href="http://example.com/?a[]=1&a[]=2"> this guy </a> ]',
@ -1767,23 +1767,23 @@ class Tests_Formatting_WPTexturize extends WP_UnitTestCase {
), ),
array( array(
'[code ...]...[/code]', // code is not a registered shortcode. '[code ...]...[/code]', // code is not a registered shortcode.
'[code ...]...[/code]', '[code &#8230;]&#8230;[/code]',
), ),
array( array(
'[hello ...]...[/hello]', // hello is not a registered shortcode. '[hello ...]...[/hello]', // hello is not a registered shortcode.
'[hello ...]&#8230;[/hello]', '[hello &#8230;]&#8230;[/hello]',
), ),
array( array(
'[...]...[/...]', // These are potentially usable shortcodes. '[...]...[/...]', // These are potentially usable shortcodes.
'[...]&#8230;[/...]', '[&#8230;]&#8230;[/&#8230;]',
), ),
array( array(
'[gal>ery ...]', '[gal>ery ...]',
'[gal>ery ...]', '[gal>ery &#8230;]',
), ),
array( array(
'[randomthing param="test"]', '[randomthing param="test"]',
'[randomthing param="test"]', '[randomthing param=&#8221;test&#8221;]',
), ),
array( array(
'[[audio]...[/audio]...', // These are potentially usable shortcodes. Unfortunately, the meaning of [[audio] is ambiguous unless we run the entire shortcode regexp. '[[audio]...[/audio]...', // These are potentially usable shortcodes. Unfortunately, the meaning of [[audio] is ambiguous unless we run the entire shortcode regexp.