Shortcodes/Formatting: Add PCRE Performance Testing

* Move pattern from `wptexturize()` into a separate function.
* Move pattern from `wp_html_split()` into a separate function.
* Beautify code for `wp_html_split()`.
* Remove unnecessary instances of `/s` modifier in patterns that don't use dots.
* Add `tests/phpunit/data/formatting/whole-posts.php` for testing larger strings.
* Add function `benchmark_pcre_backtracking()`.
* Add tests for `wp_html_split()`.
* Add tests for `wptexturize()`.
* Add tests for `get_shortcode_regex()`.

Props miqrogroove.
Fixes #34121.


git-svn-id: https://develop.svn.wordpress.org/trunk@34761 602fd350-edb4-49c9-b593-d223f7449a82
This commit is contained in:
Scott Taylor 2015-10-02 04:25:40 +00:00
parent c152375b58
commit 5a24a0a4f8
6 changed files with 1488 additions and 55 deletions

View File

@ -219,43 +219,8 @@ function wptexturize( $text, $reset = false ) {
preg_match_all( '@\[/?([^<>&/\[\]\x00-\x20]++)@', $text, $matches );
$tagnames = array_intersect( array_keys( $shortcode_tags ), $matches[1] );
$found_shortcodes = ! empty( $tagnames );
if ( $found_shortcodes ) {
$tagregexp = join( '|', array_map( 'preg_quote', $tagnames ) );
$tagregexp = "(?:$tagregexp)(?![\\w-])"; // Excerpt of get_shortcode_regex().
$shortcode_regex =
'\[' // Find start of shortcode.
. '[\/\[]?' // Shortcodes may begin with [/ or [[
. $tagregexp // Only match registered shortcodes, because performance.
. '(?:'
. '[^\[\]<>]+' // Shortcodes do not contain other shortcodes. Quantifier critical.
. '|'
. '<[^\[\]>]*>' // HTML elements permitted. Prevents matching ] before >.
. ')*+' // Possessive critical.
. '\]' // Find end of shortcode.
. '\]?'; // Shortcodes may end with ]]
}
$comment_regex =
'!' // Start of comment, after the <.
. '(?:' // Unroll the loop: Consume everything until --> is found.
. '-(?!->)' // Dash not followed by end of comment.
. '[^\-]*+' // Consume non-dashes.
. ')*+' // Loop possessively.
. '(?:-->)?'; // End of comment. If not found, match all input.
$html_regex = // Needs replaced with wp_html_split() per Shortcode API Roadmap.
'<' // Find start of element.
. '(?(?=!--)' // Is this a comment?
. $comment_regex // Find end of comment.
. '|'
. '[^>]*>?' // Find end of element. If not found, match all input.
. ')';
if ( $found_shortcodes ) {
$regex = '/(' . $html_regex . '|' . $shortcode_regex . ')/s';
} else {
$regex = '/(' . $html_regex . ')/s';
}
$shortcode_regex = $found_shortcodes ? _get_wptexturize_shortcode_regex( $tagnames ) : '';
$regex = _get_wptexturize_split_regex( $shortcode_regex );
$textarr = preg_split( $regex, $text, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY );
@ -264,7 +229,7 @@ function wptexturize( $text, $reset = false ) {
$first = $curl[0];
if ( '<' === $first ) {
if ( '<!--' === substr( $curl, 0, 4 ) ) {
// This is an HTML comment delimeter.
// This is an HTML comment delimiter.
continue;
} else {
// This is an HTML element delimiter.
@ -615,6 +580,17 @@ function wpautop( $pee, $br = true ) {
* @return array The formatted text.
*/
function wp_html_split( $input ) {
	// The pattern is built by get_html_split_regex().
	$pattern = get_html_split_regex();

	// PREG_SPLIT_DELIM_CAPTURE keeps the matched delimiters in the result,
	// interleaved with the text found between them.
	$pieces = preg_split( $pattern, $input, -1, PREG_SPLIT_DELIM_CAPTURE );

	return $pieces;
}
/**
* Retrieve the regular expression for an HTML element.
*
* @since 4.4.0
*
* @return string The regular expression
*/
function get_html_split_regex() {
static $regex;
if ( ! isset( $regex ) ) {
@ -635,22 +611,100 @@ function wp_html_split( $input ) {
. ')*+' // Loop possessively.
. '(?:]]>)?'; // End of comment. If not found, match all input.
$escaped =
'(?=' // Is the element escaped?
. '!--'
. '|'
. '!\[CDATA\['
. ')'
. '(?(?=!-)' // If yes, which type?
. $comments
. '|'
. $cdata
. ')';
$regex =
'/(' // Capture the entire match.
. '<' // Find start of element.
. '(?(?=!--)' // Is this a comment?
. $comments // Find end of comment.
. '|'
. '(?(?=!\[CDATA\[)' // Is this a comment?
. $cdata // Find end of comment.
. '|'
. '[^>]*>?' // Find end of element. If not found, match all input.
. '(?' // Conditional expression follows.
. $escaped // Find end of escaped element.
. '|' // ... else ...
. '[^>]*>?' // Find end of normal element.
. ')'
. ')'
. ')/s';
. ')/';
}
return preg_split( $regex, $input, -1, PREG_SPLIT_DELIM_CAPTURE );
return $regex;
}
/**
 * Build the delimited pattern that splits text on HTML and, optionally, shortcodes.
 *
 * The HTML portion is assembled on the first call and kept in a static
 * variable. When a shortcode pattern is supplied it is appended as an
 * additional alternative inside the single capturing group.
 *
 * @access private
 * @ignore
 * @internal This function will be removed in 4.5.0 per Shortcode API Roadmap.
 * @since 4.4.0
 *
 * @param string $shortcode_regex The result from _get_wptexturize_shortcode_regex(). Optional.
 * @return string The regular expression
 */
function _get_wptexturize_split_regex( $shortcode_regex = '' ) {
	static $html_regex;

	if ( ! isset( $html_regex ) ) {
		$comment_regex = implode( '', array(
			'!',          // Start of comment, after the <.
			'(?:',        // Unroll the loop: Consume everything until --> is found.
			'-(?!->)',    // Dash not followed by end of comment.
			'[^\-]*+',    // Consume non-dashes.
			')*+',        // Loop possessively.
			'(?:-->)?',   // End of comment. If not found, match all input.
		) );

		// Needs to be replaced with wp_html_split() per Shortcode API Roadmap.
		$html_regex = implode( '', array(
			'<',            // Find start of element.
			'(?(?=!--)',    // Is this a comment?
			$comment_regex, // Find end of comment.
			'|',
			'[^>]*>?',      // Find end of element. If not found, match all input.
			')',
		) );
	}

	// HTML always takes part; the shortcode alternative only when one was given.
	$alternatives = $html_regex;
	if ( ! empty( $shortcode_regex ) ) {
		$alternatives .= '|' . $shortcode_regex;
	}

	return '/(' . $alternatives . ')/';
}
/**
 * Retrieve the regular expression for shortcodes.
 *
 * The returned string is a pattern fragment without delimiters or a capturing
 * group; it is meant to be embedded in a larger pattern such as the one built
 * by _get_wptexturize_split_regex().
 *
 * Tag names are quoted with "/" as the delimiter so that a name containing a
 * slash cannot terminate a "/"-delimited pattern the fragment is placed in.
 *
 * @access private
 * @ignore
 * @internal This function will be removed in 4.5.0 per Shortcode API Roadmap.
 * @since 4.4.0
 *
 * @param array $tagnames List of shortcodes to find.
 * @return string The regular expression
 */
function _get_wptexturize_shortcode_regex( $tagnames ) {
	// Quote each name explicitly: array_map( 'preg_quote', ... ) cannot pass the
	// delimiter argument, which would leave "/" unescaped.
	$quoted = array();
	foreach ( $tagnames as $tagname ) {
		$quoted[] = preg_quote( $tagname, '/' );
	}

	$tagregexp = join( '|', $quoted );
	// A tag name must be followed by whitespace, "]" or "/".
	$tagregexp = "(?:$tagregexp)(?=[\\s\\]\\/])"; // Excerpt of get_shortcode_regex().

	$regex =
		  '\['              // Find start of shortcode.
		. '[\/\[]?'         // Shortcodes may begin with [/ or [[
		. $tagregexp        // Only match registered shortcodes, because performance.
		. '(?:'
		.     '[^\[\]<>]+'  // Shortcodes do not contain other shortcodes. Quantifier critical.
		. '|'
		.     '<[^\[\]>]*>' // HTML elements permitted. Prevents matching ] before >.
		. ')*+'             // Possessive critical.
		. '\]'              // Find end of shortcode.
		. '\]?';            // Shortcodes may end with ]]

	return $regex;
}
/**
@ -768,7 +822,7 @@ function shortcode_unautop( $pee ) {
. ')'
. '(?:' . $spaces . ')*+' // optional trailing whitespace
. '<\\/p>' // closing paragraph
. '/s';
. '/';
return preg_replace( $pattern, '$1', $pee );
}

View File

@ -168,7 +168,7 @@ function has_shortcode( $content, $tag ) {
}
if ( shortcode_exists( $tag ) ) {
preg_match_all( '/' . get_shortcode_regex() . '/s', $content, $matches, PREG_SET_ORDER );
preg_match_all( '/' . get_shortcode_regex() . '/', $content, $matches, PREG_SET_ORDER );
if ( empty( $matches ) )
return false;
@ -219,7 +219,7 @@ function do_shortcode( $content, $ignore_html = false ) {
$content = do_shortcodes_in_html_tags( $content, $ignore_html, $tagnames );
$pattern = get_shortcode_regex( $tagnames );
$content = preg_replace_callback( "/$pattern/s", 'do_shortcode_tag', $content );
$content = preg_replace_callback( "/$pattern/", 'do_shortcode_tag', $content );
// Always restore square braces so we don't break things like <!--[if IE ]>
$content = unescape_invalid_shortcodes( $content );
@ -378,7 +378,7 @@ function do_shortcodes_in_html_tags( $content, $ignore_html, $tagnames ) {
if ( false === $attributes ) {
// Some plugins are doing things like [name] <[email]>.
if ( 1 === preg_match( '%^<\s*\[\[?[^\[\]]+\]%', $element ) ) {
$element = preg_replace_callback( "/$pattern/s", 'do_shortcode_tag', $element );
$element = preg_replace_callback( "/$pattern/", 'do_shortcode_tag', $element );
}
// Looks like we found some crazy unfiltered HTML. Skipping it for sanity.
@ -407,12 +407,12 @@ function do_shortcodes_in_html_tags( $content, $ignore_html, $tagnames ) {
// In this specific situation we assume KSES did not run because the input
// was written by an administrator, so we should avoid changing the output
// and we do not need to run KSES here.
$attr = preg_replace_callback( "/$pattern/s", 'do_shortcode_tag', $attr );
$attr = preg_replace_callback( "/$pattern/", 'do_shortcode_tag', $attr );
} else {
// $attr like 'name = "[shortcode]"' or "name = '[shortcode]'"
// We do not know if $content was unfiltered. Assume KSES ran before shortcodes.
$count = 0;
$new_attr = preg_replace_callback( "/$pattern/s", 'do_shortcode_tag', $attr, -1, $count );
$new_attr = preg_replace_callback( "/$pattern/", 'do_shortcode_tag', $attr, -1, $count );
if ( $count > 0 ) {
// Sanitize the shortcode output using KSES.
$new_attr = wp_kses_one_attr( $new_attr, $elname );
@ -572,7 +572,7 @@ function strip_shortcodes( $content ) {
$content = do_shortcodes_in_html_tags( $content, true, $tagnames );
$pattern = get_shortcode_regex( $tagnames );
$content = preg_replace_callback( "/$pattern/s", 'strip_shortcode_tag', $content );
$content = preg_replace_callback( "/$pattern/", 'strip_shortcode_tag', $content );
// Always restore square braces so we don't break things like <!--[if IE ]>
$content = unescape_invalid_shortcodes( $content );

File diff suppressed because it is too large Load Diff

View File

@ -390,3 +390,59 @@ class wpdb_exposed_methods_for_testing extends wpdb {
return call_user_func_array( array( $this, $name ), $arguments );
}
}
/**
 * Determine approximate backtrack count when running PCRE.
 *
 * The pattern is run repeatedly with pcre.backtrack_limit set to 4, 8, 16, ...
 * up to a ceiling chosen from the PHP version. The first limit at which the
 * run finishes without a PCRE error is returned. The original
 * pcre.backtrack_limit value is restored after every run.
 *
 * @param string $pattern  The regular expression, including delimiters.
 * @param string $subject  The text to run the pattern against.
 * @param string $strategy Which PCRE function to exercise: 'split', 'match' or
 *                         'match_all'. Any other value runs no PCRE function, so
 *                         the outcome then depends on what preg_last_error()
 *                         already reported.
 * @return int The backtrack count. If no tested limit succeeded, the first
 *             doubled value above the ceiling.
 */
function benchmark_pcre_backtracking( $pattern, $subject, $strategy ) {
	$saved_config = ini_get( 'pcre.backtrack_limit' );

	// Attempt to prevent PHP crashes.  Adjust these lower when needed.
	if ( version_compare( phpversion(), '5.4.8', '>' ) ) {
		$limit = 1000000;
	} else {
		$limit = 20000;  // 20,000 is a reasonable upper limit, but see also https://core.trac.wordpress.org/ticket/29557#comment:10
	}

	// Start with small numbers, so if a crash is encountered at higher numbers we can still debug the problem.
	for ( $i = 4; $i <= $limit; $i *= 2 ) {

		ini_set( 'pcre.backtrack_limit', $i );

		switch ( $strategy ) {
			case 'split':
				preg_split( $pattern, $subject );
				break;
			case 'match':
				preg_match( $pattern, $subject );
				break;
			case 'match_all':
				preg_match_all( $pattern, $subject );
				break;
		}

		// Restore the setting before inspecting the outcome so an early return
		// never leaves the lowered limit in place.
		ini_set( 'pcre.backtrack_limit', $saved_config );

		switch ( preg_last_error() ) {
			case PREG_NO_ERROR:
				return $i;
			case PREG_BACKTRACK_LIMIT_ERROR:
				// Limit too low: leave the switch and retry with the next, doubled limit.
				// ("continue" here would act as "break" and warns on PHP 7.3+.)
				break;
			case PREG_RECURSION_LIMIT_ERROR:
				trigger_error('PCRE recursion limit encountered before backtrack limit.');
				break;
			case PREG_BAD_UTF8_ERROR:
				trigger_error('UTF-8 error during PCRE benchmark.');
				break;
			case PREG_INTERNAL_ERROR:
				trigger_error('Internal error during PCRE benchmark.');
				break;
			default:
				trigger_error('Unexpected error during PCRE benchmark.');
		}
	}

	return $i;
}

View File

@ -2048,4 +2048,29 @@ String with a number followed by a single quote !q1!Expendables 3!q1! vestibulum
),
);
}
/**
 * Automated performance testing of the main regex.
 *
 * For each dataset from data_whole_posts, benchmarks the wptexturize() split
 * pattern twice: once built without a shortcode pattern and once built from
 * the currently registered shortcode tags. Both measured backtrack counts
 * must be below 200.
 *
 * @dataProvider data_whole_posts
 */
function test_pcre_performance( $input ) {
	global $shortcode_tags;

	// With Shortcodes Disabled
	$html_only_pattern = _get_wptexturize_split_regex( );
	$html_only_count   = benchmark_pcre_backtracking( $html_only_pattern, $input, 'split' );
	$this->assertLessThan( 200, $html_only_count );

	// With Shortcodes Enabled
	$registered_tags  = array_keys( $shortcode_tags );
	$combined_pattern = _get_wptexturize_split_regex( _get_wptexturize_shortcode_regex( $registered_tags ) );
	$combined_count   = benchmark_pcre_backtracking( $combined_pattern, $input, 'split' );

	return $this->assertLessThan( 200, $combined_count );
}
/**
 * Data provider for test_pcre_performance().
 *
 * Loads the whole-posts data file once and returns whatever the global
 * data_whole_posts() function yields (presumably defined by that file).
 */
function data_whole_posts() {
	require_once DIR_TESTDATA . '/formatting/whole-posts.php';

	$datasets = data_whole_posts();

	return $datasets;
}
}

View File

@ -616,4 +616,20 @@ EOF;
),
);
}
/**
 * Automated performance testing of the main regex.
 *
 * For each dataset from data_whole_posts, runs the full shortcode pattern
 * with preg_match_all() semantics and requires the measured backtrack count
 * to be below 200.
 *
 * @dataProvider data_whole_posts
 */
function test_pcre_performance( $input ) {
	$delimited_pattern = '/' . get_shortcode_regex() . '/';
	$backtrack_count   = benchmark_pcre_backtracking( $delimited_pattern, $input, 'match_all' );

	return $this->assertLessThan( 200, $backtrack_count );
}
/**
 * Data provider for test_pcre_performance().
 *
 * Loads the whole-posts data file once and returns whatever the global
 * data_whole_posts() function yields (presumably defined by that file).
 */
function data_whole_posts() {
	require_once DIR_TESTDATA . '/formatting/whole-posts.php';

	$datasets = data_whole_posts();

	return $datasets;
}
}