Formatting: Improve accuracy of force_balance_tags() and add support for custom element tags.

This changeset includes a major iteration on the regular expression used to balance tags, with comprehensive test coverage to ensure that all scenarios are supported or unsupported as expected.

Props dmsnell, westonruter, birgire.
Fixes #47014.


git-svn-id: https://develop.svn.wordpress.org/trunk@45929 602fd350-edb4-49c9-b593-d223f7449a82
This commit is contained in:
Felix Arntz 2019-09-02 10:24:18 +00:00
parent 122cb2864b
commit ac2df4e8db
2 changed files with 299 additions and 44 deletions

View File

@ -2429,7 +2429,7 @@ function convert_invalid_entities( $content ) {
* @return string Balanced text * @return string Balanced text
*/ */
function balanceTags( $text, $force = false ) { // phpcs:ignore WordPress.NamingConventions.ValidFunctionName.FunctionNameInvalid function balanceTags( $text, $force = false ) { // phpcs:ignore WordPress.NamingConventions.ValidFunctionName.FunctionNameInvalid
if ( $force || get_option( 'use_balanceTags' ) == 1 ) { if ( $force || (int) get_option( 'use_balanceTags' ) === 1 ) {
return force_balance_tags( $text ); return force_balance_tags( $text );
} else { } else {
return $text; return $text;
@ -2440,6 +2440,7 @@ function balanceTags( $text, $force = false ) { // phpcs:ignore WordPress.Namin
* Balances tags of string using a modified stack. * Balances tags of string using a modified stack.
* *
* @since 2.0.4 * @since 2.0.4
* @since 5.3.0 Improve accuracy and add support for custom element tags.
* *
* @author Leonard Lin <leonard@acm.org> * @author Leonard Lin <leonard@acm.org>
* @license GPL * @license GPL
@ -2469,32 +2470,74 @@ function force_balance_tags( $text ) {
// WP bug fix for LOVE <3 (and other situations with '<' before a number) // WP bug fix for LOVE <3 (and other situations with '<' before a number)
$text = preg_replace( '#<([0-9]{1})#', '&lt;$1', $text ); $text = preg_replace( '#<([0-9]{1})#', '&lt;$1', $text );
while ( preg_match( '/<(\/?[\w:]*)\s*([^>]*)>/', $text, $regex ) ) { /**
* Matches supported tags.
*
* To get the pattern as a string without the comments paste into a PHP
* REPL like `php -a`.
*
* @see https://html.spec.whatwg.org/#elements-2
* @see https://w3c.github.io/webcomponents/spec/custom/#valid-custom-element-name
*
* @example
* ~# php -a
* php > $s = [paste copied contents of expression below including parentheses];
* php > echo $s;
*/
$tag_pattern = (
'#<' . // Start with an opening bracket.
'(/?)' . // Group 1 - If it's a closing tag it'll have a leading slash.
'(' . // Group 2 - Tag name.
// Custom element tags have more lenient rules than HTML tag names.
'(?:[a-z](?:[a-z0-9._]*)-(?:[a-z0-9._-]+)+)' .
'|' .
// Traditional tag rules approximate HTML tag names.
'(?:[\w:]+)' .
')' .
'(?:' .
// We either immediately close the tag with its '>' and have nothing here.
'\s*' .
'(/?)' . // Group 3 - "attributes" for empty tag.
'|' .
// Or we must start with space characters to separate the tag name from the attributes (or whitespace).
'(\s+)' . // Group 4 - Pre-attribute whitespace.
'([^>]*)' . // Group 5 - Attributes.
')' .
'>#' // End with a closing bracket.
);
while ( preg_match( $tag_pattern, $text, $regex ) ) {
$full_match = $regex[0];
$has_leading_slash = ! empty( $regex[1] );
$tag_name = $regex[2];
$tag = strtolower( $tag_name );
$is_single_tag = in_array( $tag, $single_tags, true );
$pre_attribute_ws = isset( $regex[4] ) ? $regex[4] : '';
$attributes = trim( isset( $regex[5] ) ? $regex[5] : $regex[3] );
$has_self_closer = '/' === substr( $attributes, -1 );
$newtext .= $tagqueue; $newtext .= $tagqueue;
$i = strpos( $text, $regex[0] ); $i = strpos( $text, $full_match );
$l = strlen( $regex[0] ); $l = strlen( $full_match );
// clear the shifter // Clear the shifter.
$tagqueue = ''; $tagqueue = '';
// Pop or Push if ( $has_leading_slash ) { // End Tag.
if ( isset( $regex[1][0] ) && '/' == $regex[1][0] ) { // End Tag // If too many closing tags.
$tag = strtolower( substr( $regex[1], 1 ) );
// if too many closing tags
if ( $stacksize <= 0 ) { if ( $stacksize <= 0 ) {
$tag = ''; $tag = '';
// or close to be safe $tag = '/' . $tag; // Or close to be safe $tag = '/' . $tag.
// if stacktop value = tag close value then pop // If stacktop value = tag close value, then pop.
} elseif ( $tagstack[ $stacksize - 1 ] == $tag ) { // found closing tag } elseif ( $tagstack[ $stacksize - 1 ] === $tag ) { // Found closing tag.
$tag = '</' . $tag . '>'; // Close Tag $tag = '</' . $tag . '>'; // Close Tag.
// Pop
array_pop( $tagstack ); array_pop( $tagstack );
$stacksize--; $stacksize--;
} else { // closing tag not at top, search for it } else { // Closing tag not at top, search for it.
for ( $j = $stacksize - 1; $j >= 0; $j-- ) { for ( $j = $stacksize - 1; $j >= 0; $j-- ) {
if ( $tagstack[ $j ] == $tag ) { if ( $tagstack[ $j ] === $tag ) {
// add tag to tagqueue // Add tag to tagqueue.
for ( $k = $stacksize - 1; $k >= $j; $k-- ) { for ( $k = $stacksize - 1; $k >= $j; $k-- ) {
$tagqueue .= '</' . array_pop( $tagstack ) . '>'; $tagqueue .= '</' . array_pop( $tagstack ) . '>';
$stacksize--; $stacksize--;
@ -2504,39 +2547,33 @@ function force_balance_tags( $text ) {
} }
$tag = ''; $tag = '';
} }
} else { // Begin Tag } else { // Begin Tag.
$tag = strtolower( $regex[1] ); if ( $has_self_closer ) { // If it presents itself as a self-closing tag...
// Tag Cleaning
// If it's an empty tag "< >", do nothing
if ( '' == $tag ) {
// do nothing
} elseif ( substr( $regex[2], -1 ) == '/' ) { // ElseIf it presents itself as a self-closing tag...
// ...but it isn't a known single-entity self-closing tag, then don't let it be treated as such and // ...but it isn't a known single-entity self-closing tag, then don't let it be treated as such and
// immediately close it with a closing tag (the tag will encapsulate no text as a result) // immediately close it with a closing tag (the tag will encapsulate no text as a result)
if ( ! in_array( $tag, $single_tags ) ) { if ( ! $is_single_tag ) {
$regex[2] = trim( substr( $regex[2], 0, -1 ) ) . "></$tag"; $attributes = trim( substr( $attributes, 0, -1 ) ) . "></$tag";
} }
} elseif ( in_array( $tag, $single_tags ) ) { // ElseIf it's a known single-entity tag but it doesn't close itself, do so } elseif ( $is_single_tag ) { // ElseIf it's a known single-entity tag but it doesn't close itself, do so
$regex[2] .= '/'; $pre_attribute_ws = ' ';
} else { // Else it's not a single-entity tag $attributes .= '/';
// If the top of the stack is the same as the tag we want to push, close previous tag } else { // It's not a single-entity tag.
if ( $stacksize > 0 && ! in_array( $tag, $nestable_tags ) && $tagstack[ $stacksize - 1 ] == $tag ) { // If the top of the stack is the same as the tag we want to push, close previous tag.
if ( $stacksize > 0 && ! in_array( $tag, $nestable_tags, true ) && $tagstack[ $stacksize - 1 ] === $tag ) {
$tagqueue = '</' . array_pop( $tagstack ) . '>'; $tagqueue = '</' . array_pop( $tagstack ) . '>';
$stacksize--; $stacksize--;
} }
$stacksize = array_push( $tagstack, $tag ); $stacksize = array_push( $tagstack, $tag );
} }
// Attributes // Attributes.
$attributes = $regex[2]; if ( $has_self_closer && $is_single_tag ) {
if ( ! empty( $attributes ) && $attributes[0] != '>' ) { // We need some space - avoid <br/> and prefer <br />.
$attributes = ' ' . $attributes; $pre_attribute_ws = ' ';
} }
$tag = '<' . $tag . $attributes . '>'; $tag = '<' . $tag . $pre_attribute_ws . $attributes . '>';
//If already queuing a close tag, then put this tag on, too // If already queuing a close tag, then put this tag on too.
if ( ! empty( $tagqueue ) ) { if ( ! empty( $tagqueue ) ) {
$tagqueue .= $tag; $tagqueue .= $tag;
$tag = ''; $tag = '';
@ -2546,18 +2583,17 @@ function force_balance_tags( $text ) {
$text = substr( $text, $i + $l ); $text = substr( $text, $i + $l );
} }
// Clear Tag Queue // Clear Tag Queue.
$newtext .= $tagqueue; $newtext .= $tagqueue;
// Add Remaining text // Add remaining text.
$newtext .= $text; $newtext .= $text;
// Empty Stack
while ( $x = array_pop( $tagstack ) ) { while ( $x = array_pop( $tagstack ) ) {
$newtext .= '</' . $x . '>'; // Add remaining tags to close $newtext .= '</' . $x . '>'; // Add remaining tags to close.
} }
// WP fix for the bug with HTML comments // WP fix for the bug with HTML comments.
$newtext = str_replace( '< !--', '<!--', $newtext ); $newtext = str_replace( '< !--', '<!--', $newtext );
$newtext = str_replace( '< !--', '< !--', $newtext ); $newtext = str_replace( '< !--', '< !--', $newtext );

View File

@ -37,6 +37,158 @@ class Tests_Formatting_BalanceTags extends WP_UnitTestCase {
); );
} }
/**
 * Data provider: traditional HTML tag names in lower, mixed and upper case.
 *
 * @return array[] One single-element data set per tag name.
 */
function supported_traditional_tag_names() {
	$tag_names = array(
		'a',
		'div',
		'blockquote',
		// HTML tag names can be CAPITALIZED and are case-insensitive.
		'A',
		'dIv',
		'BLOCKQUOTE',
	);

	// Wrap every name in its own array so each one becomes a separate data set.
	$data_sets = array();
	foreach ( $tag_names as $tag_name ) {
		$data_sets[] = array( $tag_name );
	}

	return $data_sets;
}
/**
 * Data provider: custom element tag names that the tests expect to be balanced.
 *
 * @return array[] One single-element data set per tag name.
 */
function supported_custom_element_tag_names() {
	return array_map(
		// Each name becomes its own one-argument data set.
		function ( $tag_name ) {
			return array( $tag_name );
		},
		array(
			'custom-element',
			'my-custom-element',
			'weekday-5-item',
			'a-big-old-tag-name',
			'with_underscores-and_the_dash',
			'a-.',
			'a._-.-_',
		)
	);
}
/**
 * Data provider: markup whose tag-like sequences are not valid tag names.
 *
 * @return array[] Data sets of input markup followed by the expected output.
 */
function invalid_tag_names() {
	// Can't start with a number - handled by the "<3" fix.
	$leading_digit = array( '<0-day>inside', '&lt;0-day>inside' );

	// Custom elements cannot be uppercase.
	$uppercase_custom_element = array( '<UPPERCASE-TAG>inside', '<UPPERCASE-TAG>inside' );

	return array( $leading_digit, $uppercase_custom_element );
}
/**
* These are valid custom elements but we don't support them yet.
*
* @see https://w3c.github.io/webcomponents/spec/custom/#valid-custom-element-name
*/
function unsupported_valid_tag_names() {
return array(
// We don't allow ending in a dash.
array( '<what->inside' ),
// Examples from the spec working document.
array( 'math-α' ),
array( 'emotion-😍' ),
// UNICODE ranges
// 0x00b7
array( 'b-·' ),
// Latin characters with accents/modifiers.
// 0x00c0-0x00d6
// 0x00d8-0x00f6
array( 'a-À-Ó-Ý' ),
// 0x00f8-0x037d
array( 'a-ͳ' ),
// No 0x037e, which is a Greek semicolon.
// 0x037f-0x1fff
array( 'a-Ფ' ),
// Zero-width characters, probably never supported.
// 0x200c-0x200d
array( 'a-to-my-left-is-a-zero-width-non-joiner-do-not-delete-it' ),
array( 'a-to-my-left-is-a-zero-width-joiner-do-not-delete-it' ),
// Ties.
// 0x203f-0x2040
array( 'under-‿-tie' ),
array( 'over-⁀-tie' ),
// 0x2170-0x218f
array( 'a-⁰' ),
array( 'a-⅀' ),
array( 'tag-ↀ-it' ),
// 0x2c00-0x2fef
array( 'a-Ⰰ' ),
array( 'b-ⴓ-c' ),
array( 'd-⽗' ),
// 0x3001-0xd7ff
array( 'a-、' ),
array( 'z-态' ),
array( 'a-送-䠺-ퟱ-퟿' ),
// 0xf900-0xfdcf
array( 'a-豈' ),
array( 'my-切' ),
array( 'aﴀ-tag' ),
array( 'my-﷌' ),
// 0xfdf0-0xfffd
array( 'a-ﷰ' ),
array( 'a---<2D>' ), // Warning; blank characters are in there.
// Extended ranges.
// 0x10000-0xeffff
array( 'a-𐀀' ),
array( 'my-𝀀' ),
array( 'a𞀀-𜿐' ),
);
}
/**
 * Data provider: names that are not valid custom element names but that are
 * still balanced, which keeps the parser simpler.
 *
 * @see https://w3c.github.io/webcomponents/spec/custom/#valid-custom-element-name
 *
 * @return array[] One single-element data set per tag name.
 */
function supported_invalid_tag_names() {
	// Reserved names for custom elements.
	$reserved_names = array(
		'annotation-xml',
		'color-profile',
		'font-face',
		'font-face-src',
		'font-face-uri',
		'font-face-format',
		'font-face-name',
		'missing-glyph',
	);

	$data_sets = array();
	foreach ( $reserved_names as $reserved_name ) {
		$data_sets[] = array( $reserved_name );
	}

	return $data_sets;
}
/**
 * Checks that an unclosed traditional tag gets a closing tag and that the
 * tag name is lowercased in the balanced output.
 *
 * @ticket 47014
 * @dataProvider supported_traditional_tag_names
 *
 * @param string $tag Tag name, in any letter case.
 */
function test_detects_traditional_tag_names( $tag ) {
	$normalized = strtolower( $tag );

	// assertSame() checks type and value, so a loose comparison cannot hide a difference.
	$this->assertSame( "<$normalized>inside</$normalized>", balanceTags( "<$tag>inside", true ) );
}
/**
 * Checks that an unclosed supported custom element gets a closing tag and
 * that its name is left unchanged.
 *
 * @ticket 47014
 * @dataProvider supported_custom_element_tag_names
 *
 * @param string $tag Custom element tag name.
 */
function test_detects_supported_custom_element_tag_names( $tag ) {
	// assertSame() checks type and value, so a loose comparison cannot hide a difference.
	$this->assertSame( "<$tag>inside</$tag>", balanceTags( "<$tag>inside", true ) );
}
/**
 * Checks that markup with an invalid tag name produces the exact output
 * given by the data provider and gains no closing tag.
 *
 * @ticket 47014
 * @dataProvider invalid_tag_names
 *
 * @param string $input  Markup passed to balanceTags().
 * @param string $output Expected result.
 */
function test_ignores_invalid_tag_names( $input, $output ) {
	// assertSame() checks type and value, so a loose comparison cannot hide a difference.
	$this->assertSame( $output, balanceTags( $input, true ) );
}
/**
 * Checks that markup using a valid but unsupported custom element name is
 * returned unchanged.
 *
 * @ticket 47014
 * @dataProvider unsupported_valid_tag_names
 *
 * @param string $tag Unsupported custom element tag name.
 */
function test_ignores_unsupported_custom_tag_names( $tag ) {
	// assertSame() checks type and value, so a loose comparison cannot hide a difference.
	$this->assertSame( "<$tag>inside", balanceTags( "<$tag>inside", true ) );
}
/**
 * Checks that reserved names, which are not valid custom element names, are
 * still balanced with a closing tag.
 *
 * @ticket 47014
 * @dataProvider supported_invalid_tag_names
 *
 * @param string $tag Reserved tag name.
 */
function test_detects_supported_invalid_tag_names( $tag ) {
	// assertSame() checks type and value, so a loose comparison cannot hide a difference.
	$this->assertSame( "<$tag>inside</$tag>", balanceTags( "<$tag>inside", true ) );
}
/** /**
* If a recognized valid single tag appears unclosed, it should get self-closed * If a recognized valid single tag appears unclosed, it should get self-closed
* *
@ -68,12 +220,15 @@ class Tests_Formatting_BalanceTags extends WP_UnitTestCase {
'<em />', '<em />',
'<p class="main1"/>', '<p class="main1"/>',
'<p class="main2" />', '<p class="main2" />',
'<STRONG/>',
); );
$expected = array( $expected = array(
'<strong></strong>', '<strong></strong>',
'<em></em>', '<em></em>',
'<p class="main1"></p>', '<p class="main1"></p>',
'<p class="main2"></p>', '<p class="main2"></p>',
// Valid tags are transformed to lowercase.
'<strong></strong>',
); );
foreach ( $inputs as $key => $input ) { foreach ( $inputs as $key => $input ) {
@ -221,4 +376,68 @@ class Tests_Formatting_BalanceTags extends WP_UnitTestCase {
} }
} }
/**
 * Data provider for custom element balancing.
 *
 * @return array[] Data sets of source markup followed by the expected output.
 */
public function data_custom_elements() {
	// Valid custom element tags.
	$valid_cases = array(
		array(
			'<my-custom-element data-attribute="value"/>',
			'<my-custom-element data-attribute="value"></my-custom-element>',
		),
		array(
			'<my-custom-element>Test</my-custom-element>',
			'<my-custom-element>Test</my-custom-element>',
		),
		array(
			'<my-custom-element>Test',
			'<my-custom-element>Test</my-custom-element>',
		),
		array(
			'Test</my-custom-element>',
			'Test',
		),
		array(
			'</my-custom-element>Test',
			'Test',
		),
		array(
			'<my-custom-element/>',
			'<my-custom-element></my-custom-element>',
		),
		array(
			'<my-custom-element />',
			'<my-custom-element></my-custom-element>',
		),
	);

	// Invalid (or at least temporarily unsupported) custom element tags.
	$unsupported_cases = array(
		array(
			'<MY-CUSTOM-ELEMENT>Test',
			'<MY-CUSTOM-ELEMENT>Test',
		),
		array(
			'<my->Test',
			'<my->Test',
		),
		array(
			'<--->Test',
			'<--->Test',
		),
	);

	// Both lists use sequential integer keys, so merging keeps a single 0-based list.
	return array_merge( $valid_cases, $unsupported_cases );
}
/**
 * Checks that balanceTags() turns each custom element source string into
 * exactly the expected markup.
 *
 * @ticket 47014
 * @dataProvider data_custom_elements
 *
 * @param string $source   Markup passed to balanceTags().
 * @param string $expected Expected balanced markup.
 */
public function test_custom_elements( $source, $expected ) {
	// assertSame() checks type and value, so a loose comparison cannot hide a difference.
	$this->assertSame( $expected, balanceTags( $source, true ) );
}
} }