The joys of `wptexturize()`:

* Revert parts of [28773] and [28727] and [29748].
* Do not crash PHP. Make the shortcode quantifier possessive to avoid backtracking.
* Reduce backtracking in long HTML comments by 100x.
* Do not ignore unclosed HTML comments.
* Do not break unregistered shortcodes, e.g. `[hello attr="value"]`.
* Do not break HTML in shortcode attributes, e.g. `[hello attr="<"]`.
* Do not match for shortcodes when there is extra whitespace, e.g. `[ hello ]`.
* Add unit tests to show #12690 was not fully resolved.
* Tested PHP 5.2.4, 5.2.13, 5.4.32, and 5.5.8.

Adds/modifies unit tests.

Props miqrogroove.
See #29557.


git-svn-id: https://develop.svn.wordpress.org/trunk@29781 602fd350-edb4-49c9-b593-d223f7449a82
This commit is contained in:
Scott Taylor 2014-09-29 04:06:54 +00:00
parent 18adbb6439
commit 23f7f53be2
3 changed files with 100 additions and 38 deletions

View File

@ -28,7 +28,7 @@
* @return string The string replaced with html entities * @return string The string replaced with html entities
*/ */
function wptexturize($text, $reset = false) { function wptexturize($text, $reset = false) {
global $wp_cockneyreplace, $shortcode_tags; global $wp_cockneyreplace;
static $static_characters, $static_replacements, $dynamic_characters, $dynamic_replacements, static $static_characters, $static_replacements, $dynamic_characters, $dynamic_replacements,
$default_no_texturize_tags, $default_no_texturize_shortcodes, $run_texturize = true; $default_no_texturize_tags, $default_no_texturize_shortcodes, $run_texturize = true;
@ -205,45 +205,55 @@ function wptexturize($text, $reset = false) {
// Look for shortcodes and HTML elements. // Look for shortcodes and HTML elements.
$tagnames = array_keys( $shortcode_tags ); $comment_regex =
$tagregexp = join( '|', array_map( 'preg_quote', $tagnames ) ); '!' // Start of comment, after the <.
$tagregexp = "(?:$tagregexp)(?![\\w-])"; // Excerpt of get_shortcode_regex(). . '(?:' // Unroll the loop: Consume everything until --> is found.
. '-(?!->)' // Dash not followed by end of comment.
. '[^\-]*+' // Consume non-dashes.
. ')*+' // Loop possessively.
. '(?:-->)?'; // End of comment. If not found, match all input.
$regex = '/(' // Capture the entire match. $shortcode_regex =
. '<' // Find start of element. '\[' // Find start of shortcode.
. '(?(?=!--)' // Is this a comment? . '[\/\[]?' // Shortcodes may begin with [/ or [[
. '.+?--\s*>' // Find end of comment . '[^\s\/\[\]]' // No whitespace before name.
. '[^\[\]]*+' // Shortcodes do not contain other shortcodes. Possessive critical.
. '\]' // Find end of shortcode.
. '\]?'; // Shortcodes may end with ]]
$regex =
'/(' // Capture the entire match.
. '<' // Find start of element.
. '(?(?=!--)' // Is this a comment?
. $comment_regex // Find end of comment.
. '|'
. '[^>]+>' // Find end of element.
. ')'
. '|' . '|'
. '[^>]+>' // Find end of element . $shortcode_regex // Find shortcodes.
. ')' . ')/s';
. '|'
. '\[' // Find start of shortcode.
. '\[?' // Shortcodes may begin with [[
. '\/?' // Closing slash may precede name.
. $tagregexp // Only match registered shortcodes, because performance.
. '[^\[\]]*' // Shortcodes do not contain other shortcodes.
. '\]' // Find end of shortcode.
. '\]?' // Shortcodes may end with ]]
. ')/s';
$textarr = preg_split( $regex, $text, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY ); $textarr = preg_split( $regex, $text, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY );
foreach ( $textarr as &$curl ) { foreach ( $textarr as &$curl ) {
// Only call _wptexturize_pushpop_element if $curl is a delimiter. // Only call _wptexturize_pushpop_element if $curl is a delimiter.
$first = $curl[0]; $first = $curl[0];
if ( '<' === $first && '>' === substr( $curl, -1 ) ) { if ( '<' === $first && '<!--' === substr( $curl, 0, 4 ) ) {
// This is an HTML delimiter. // This is an HTML comment delimiter.
if ( '<!--' !== substr( $curl, 0, 4 ) ) { continue;
_wptexturize_pushpop_element( $curl, $no_texturize_tags_stack, $no_texturize_tags );
} } elseif ( '<' === $first && '>' === substr( $curl, -1 ) ) {
// This is an HTML element delimiter.
_wptexturize_pushpop_element( $curl, $no_texturize_tags_stack, $no_texturize_tags );
} elseif ( '' === trim( $curl ) ) { } elseif ( '' === trim( $curl ) ) {
// This is a newline between delimiters. Performance improves when we check this. // This is a newline between delimiters. Performance improves when we check this.
continue; continue;
} elseif ( '[' === $first && 1 === preg_match( '/^\[\[?\/?' . $tagregexp . '[^\[\]]*\]\]?$/', $curl ) ) { } elseif ( '[' === $first && 1 === preg_match( '/^' . $shortcode_regex . '$/', $curl ) ) {
// This is a shortcode delimiter. // This is a shortcode delimiter.
if ( '[[' !== substr( $curl, 0, 2 ) && ']]' !== substr( $curl, -2 ) ) { if ( '[[' !== substr( $curl, 0, 2 ) && ']]' !== substr( $curl, -2 ) ) {

View File

@ -231,7 +231,7 @@ function get_shortcode_regex() {
$tagregexp = join( '|', array_map('preg_quote', $tagnames) ); $tagregexp = join( '|', array_map('preg_quote', $tagnames) );
// WARNING! Do not change this regex without changing do_shortcode_tag() and strip_shortcode_tag() // WARNING! Do not change this regex without changing do_shortcode_tag() and strip_shortcode_tag()
// Also, see shortcode_unautop() and shortcode.js and wptexturize(). // Also, see shortcode_unautop() and shortcode.js.
return return
'\\[' // Opening bracket '\\[' // Opening bracket
. '(\\[?)' // 1: Optional second opening bracket for escaping shortcodes: [[tag]] . '(\\[?)' // 1: Optional second opening bracket for escaping shortcodes: [[tag]]

View File

@ -1187,14 +1187,30 @@ class Tests_Formatting_WPTexturize extends WP_UnitTestCase {
function data_tag_avoidance() { function data_tag_avoidance() {
return array( return array(
array(
'[ ... ]',
'[ &#8230; ]',
),
array( array(
'[ is it wise to <a title="allow user content ] here? hmm"> maybe </a> ]', '[ is it wise to <a title="allow user content ] here? hmm"> maybe </a> ]',
'[ is it wise to <a title="allow user content ] here? hmm"> maybe </a> ]', '[ is it wise to <a title="allow user content ] here? hmm"> maybe </a> ]',
), ),
array(
'[is it wise to <a title="allow user content ] here? hmm"> maybe </a> ]', // HTML corruption is a known bug. See tickets #12690 and #29557.
'[is it wise to <a title="allow user content ] here? hmm&#8221;> maybe </a> ]',
),
array(
'[caption - is it wise to <a title="allow user content ] here? hmm"> maybe </a> ]',
'[caption - is it wise to <a title="allow user content ] here? hmm&#8221;> maybe </a> ]',
),
array( array(
'[ photos by <a href="http://example.com/?a[]=1&a[]=2"> this guy </a> ]', '[ photos by <a href="http://example.com/?a[]=1&a[]=2"> this guy </a> ]',
'[ photos by <a href="http://example.com/?a[]=1&#038;a[]=2"> this guy </a> ]', '[ photos by <a href="http://example.com/?a[]=1&#038;a[]=2"> this guy </a> ]',
), ),
array(
'[photos by <a href="http://example.com/?a[]=1&a[]=2"> this guy </a>]',
'[photos by <a href="http://example.com/?a[]=1&#038;a[]=2"> this guy </a>]',
),
array( array(
'[gallery ...]', '[gallery ...]',
'[gallery ...]', '[gallery ...]',
@ -1211,10 +1227,6 @@ class Tests_Formatting_WPTexturize extends WP_UnitTestCase {
'[/gallery ...]', // This would actually be ignored by the shortcode system. The decision to not texturize it is intentional, if not correct. '[/gallery ...]', // This would actually be ignored by the shortcode system. The decision to not texturize it is intentional, if not correct.
'[/gallery ...]', '[/gallery ...]',
), ),
array(
'[...]...[/...]', // These are potentially usable shortcodes.
'[&#8230;]&#8230;[/&#8230;]',
),
array( array(
'[[gallery]]...[[/gallery]]', // Shortcode parsing will ignore the inner ]...[ part and treat this as a single escaped shortcode. '[[gallery]]...[[/gallery]]', // Shortcode parsing will ignore the inner ]...[ part and treat this as a single escaped shortcode.
'[[gallery]]&#8230;[[/gallery]]', '[[gallery]]&#8230;[[/gallery]]',
@ -1223,10 +1235,6 @@ class Tests_Formatting_WPTexturize extends WP_UnitTestCase {
'[[[gallery]]]...[[[/gallery]]]', // Again, shortcode parsing matches, but only the [[gallery] and [/gallery]] parts. '[[[gallery]]]...[[[/gallery]]]', // Again, shortcode parsing matches, but only the [[gallery] and [/gallery]] parts.
'[[[gallery]]]&#8230;[[[/gallery]]]', '[[[gallery]]]&#8230;[[[/gallery]]]',
), ),
array(
'[gal>ery ...]',
'[gal>ery &#8230;]',
),
array( array(
'[gallery ...', '[gallery ...',
'[gallery &#8230;', '[gallery &#8230;',
@ -1300,8 +1308,40 @@ class Tests_Formatting_WPTexturize extends WP_UnitTestCase {
'<!--...-->', '<!--...-->',
), ),
array( array(
'<!-- ... -- >', '<!-- ... -- > ...',
'<!-- ... -- >', '<!-- ... -- > ...',
),
array(
'<!-- ...', // An unclosed comment is still a comment.
'<!-- ...',
),
array(
'a<!-->b', // Browsers seem to allow this.
'a<!-->b',
),
array(
'a<!--->b',
'a<!--->b',
),
array(
'a<!---->b',
'a<!---->b',
),
array(
'a<!----->b',
'a<!----->b',
),
array(
'a<!-- c --->b',
'a<!-- c --->b',
),
array(
'a<!-- c -- d -->b',
'a<!-- c -- d -->b',
),
array(
'a<!-- <!-- c --> -->b<!-- close -->',
'a<!-- <!-- c --> &#8211;>b<!-- close -->',
), ),
array( array(
'<!-- <br /> [gallery] ... -->', '<!-- <br /> [gallery] ... -->',
@ -1727,11 +1767,23 @@ class Tests_Formatting_WPTexturize extends WP_UnitTestCase {
), ),
array( array(
'[code ...]...[/code]', // code is not a registered shortcode. '[code ...]...[/code]', // code is not a registered shortcode.
'[code &#8230;]&#8230;[/code]', '[code ...]...[/code]',
), ),
array( array(
'[hello ...]...[/hello]', // hello is not a registered shortcode. '[hello ...]...[/hello]', // hello is not a registered shortcode.
'[hello &#8230;]&#8230;[/hello]', '[hello ...]&#8230;[/hello]',
),
array(
'[...]...[/...]', // These are potentially usable shortcodes.
'[...]&#8230;[/...]',
),
array(
'[gal>ery ...]',
'[gal>ery ...]',
),
array(
'[randomthing param="test"]',
'[randomthing param="test"]',
), ),
array( array(
'[[audio]...[/audio]...', // These are potentially usable shortcodes. Unfortunately, the meaning of [[audio] is ambiguous unless we run the entire shortcode regexp. '[[audio]...[/audio]...', // These are potentially usable shortcodes. Unfortunately, the meaning of [[audio] is ambiguous unless we run the entire shortcode regexp.