wptexturize() improvements:

* Make sure that strings ending with a number and quotation mark get the proper smart quotes
* Introduce `wptexturize_primes()`, a logic tree to determine whether or not "7'." represents seven feet, then converts the special char into either a prime char or a closing quote char.

Adds unit tests.

Props miqrogroove.
Fixes #29256.


git-svn-id: https://develop.svn.wordpress.org/trunk@32863 602fd350-edb4-49c9-b593-d223f7449a82
This commit is contained in:
Scott Taylor 2015-06-19 20:05:52 +00:00
parent da826f59a5
commit ed8b9a8d27
2 changed files with 243 additions and 32 deletions

View File

@ -46,7 +46,17 @@ function wptexturize( $text, $reset = false ) {
$dynamic_replacements = null,
$default_no_texturize_tags = null,
$default_no_texturize_shortcodes = null,
$run_texturize = true;
$run_texturize = true,
$apos = null,
$prime = null,
$double_prime = null,
$opening_quote = null,
$closing_quote = null,
$opening_single_quote = null,
$closing_single_quote = null,
$open_q_flag = '<!--oq-->',
$open_sq_flag = '<!--osq-->',
$apos_flag = '<!--apos-->';
// If there's nothing to do, just stop.
if ( empty( $text ) || false === $run_texturize ) {
@ -129,40 +139,30 @@ function wptexturize( $text, $reset = false ) {
// '99' and '99" are ambiguous among other patterns; assume it's an abbreviated year at the end of a quotation.
if ( "'" !== $apos || "'" !== $closing_single_quote ) {
$dynamic[ '/\'(\d\d)\'(?=\Z|[.,:;!?)}\-\]]|&gt;|' . $spaces . ')/' ] = $apos . '$1' . $closing_single_quote;
$dynamic[ '/\'(\d\d)\'(?=\Z|[.,:;!?)}\-\]]|&gt;|' . $spaces . ')/' ] = $apos_flag . '$1' . $closing_single_quote;
}
if ( "'" !== $apos || '"' !== $closing_quote ) {
$dynamic[ '/\'(\d\d)"(?=\Z|[.,:;!?)}\-\]]|&gt;|' . $spaces . ')/' ] = $apos . '$1' . $closing_quote;
$dynamic[ '/\'(\d\d)"(?=\Z|[.,:;!?)}\-\]]|&gt;|' . $spaces . ')/' ] = $apos_flag . '$1' . $closing_quote;
}
// '99 '99s '99's (apostrophe) But never '9 or '99% or '999 or '99.0.
if ( "'" !== $apos ) {
$dynamic[ '/\'(?=\d\d(?:\Z|(?![%\d]|[.,]\d)))/' ] = $apos;
$dynamic[ '/\'(?=\d\d(?:\Z|(?![%\d]|[.,]\d)))/' ] = $apos_flag;
}
// Quoted Numbers like '0.42'
if ( "'" !== $opening_single_quote && "'" !== $closing_single_quote ) {
$dynamic[ '/(?<=\A|' . $spaces . ')\'(\d[.,\d]*)\'/' ] = $opening_single_quote . '$1' . $closing_single_quote;
$dynamic[ '/(?<=\A|' . $spaces . ')\'(\d[.,\d]*)\'/' ] = $open_sq_flag . '$1' . $closing_single_quote;
}
// Single quote at start, or preceded by (, {, <, [, ", -, or spaces.
if ( "'" !== $opening_single_quote ) {
$dynamic[ '/(?<=\A|[([{"\-]|&lt;|' . $spaces . ')\'/' ] = $opening_single_quote;
$dynamic[ '/(?<=\A|[([{"\-]|&lt;|' . $spaces . ')\'/' ] = $open_sq_flag;
}
// Apostrophe in a word. No spaces, double apostrophes, or other punctuation.
if ( "'" !== $apos ) {
$dynamic[ '/(?<!' . $spaces . ')\'(?!\Z|[.,:;!?"\'(){}[\]\-]|&[lg]t;|' . $spaces . ')/' ] = $apos;
}
// 9' (prime)
if ( "'" !== $prime ) {
$dynamic[ '/(?<=\d)\'/' ] = $prime;
}
// Single quotes followed by spaces or ending punctuation.
if ( "'" !== $closing_single_quote ) {
$dynamic[ '/\'(?=\Z|[.,:;!?)}\-\]]|&gt;|' . $spaces . ')/' ] = $closing_single_quote;
$dynamic[ '/(?<!' . $spaces . ')\'(?!\Z|[.,:;!?"\'(){}[\]\-]|&[lg]t;|' . $spaces . ')/' ] = $apos_flag;
}
$dynamic_characters['apos'] = array_keys( $dynamic );
@ -171,22 +171,12 @@ function wptexturize( $text, $reset = false ) {
// Quoted Numbers like "42"
if ( '"' !== $opening_quote && '"' !== $closing_quote ) {
$dynamic[ '/(?<=\A|' . $spaces . ')"(\d[.,\d]*)"/' ] = $opening_quote . '$1' . $closing_quote;
}
// 9" (double prime)
if ( '"' !== $double_prime ) {
$dynamic[ '/(?<=\d)"/' ] = $double_prime;
$dynamic[ '/(?<=\A|' . $spaces . ')"(\d[.,\d]*)"/' ] = $open_q_flag . '$1' . $closing_quote;
}
// Double quote at start, or preceded by (, {, <, [, -, or spaces, and not followed by spaces.
if ( '"' !== $opening_quote ) {
$dynamic[ '/(?<=\A|[([{\-]|&lt;|' . $spaces . ')"(?!' . $spaces . ')/' ] = $opening_quote;
}
// Any remaining double quotes.
if ( '"' !== $closing_quote ) {
$dynamic[ '/"/' ] = $closing_quote;
$dynamic[ '/(?<=\A|[([{\-]|&lt;|' . $spaces . ')"(?!' . $spaces . ')/' ] = $open_q_flag;
}
$dynamic_characters['quote'] = array_keys( $dynamic );
@ -300,9 +290,14 @@ function wptexturize( $text, $reset = false ) {
if ( false !== strpos( $curl, "'" ) ) {
$curl = preg_replace( $dynamic_characters['apos'], $dynamic_replacements['apos'], $curl );
$curl = wptexturize_primes( $curl, "'", $prime, $open_sq_flag, $closing_single_quote );
$curl = str_replace( $apos_flag, $apos, $curl );
$curl = str_replace( $open_sq_flag, $opening_single_quote, $curl );
}
if ( false !== strpos( $curl, '"' ) ) {
$curl = preg_replace( $dynamic_characters['quote'], $dynamic_replacements['quote'], $curl );
$curl = wptexturize_primes( $curl, '"', $double_prime, $open_q_flag, $closing_quote );
$curl = str_replace( $open_q_flag, $opening_quote, $curl );
}
if ( false !== strpos( $curl, '-' ) ) {
$curl = preg_replace( $dynamic_characters['dash'], $dynamic_replacements['dash'], $curl );
@ -321,6 +316,74 @@ function wptexturize( $text, $reset = false ) {
return preg_replace( '/&(?!#(?:\d+|x[a-f0-9]+);|[a-z1-4]{1,8};)/i', '&#038;', $text );
}
/**
* Implements a logic tree to determine whether or not "7'." represents seven feet,
* then converts the special char into either a prime char or a closing quote char.
*
* @since 4.3.0
*
* @param string $haystack The plain text to be searched.
* @param string $needle The character to search for such as ' or ".
* @param string $prime The prime char to use for replacement.
* @param string $open_quote The opening quote char. Opening quote replacement must be accomplished already.
* @param string $close_quote The closing quote char to use for replacement.
* @return string The $haystack value after primes and quotes replacements.
*/
function wptexturize_primes( $haystack, $needle, $prime, $open_quote, $close_quote ) {
$spaces = wp_spaces_regexp();
$flag = '<!--wp-prime-or-quote-->';
$quote_pattern = "/$needle(?=\\Z|[.,:;!?)}\\-\\]]|&gt;|" . $spaces . ")/";
$prime_pattern = "/(?<=\\d)$needle/";
$flag_after_digit = "/(?<=\\d)$flag/";
$flag_no_digit = "/(?<!\\d)$flag/";
$sentences = explode( $open_quote, $haystack );
foreach( $sentences as $key => &$sentence ) {
if ( false === strpos( $sentence, $needle ) ) {
continue;
} elseif ( 0 !== $key && 0 === substr_count( $sentence, $close_quote ) ) {
$sentence = preg_replace( $quote_pattern, $flag, $sentence, -1, $count );
if ( $count > 1 ) {
// This sentence appears to have multiple closing quotes. Attempt Vulcan logic.
$sentence = preg_replace( $flag_no_digit, $close_quote, $sentence, -1, $count2 );
if ( 0 === $count2 ) {
// Try looking for a quote followed by a period.
$count2 = substr_count( $sentence, "$flag." );
if ( $count2 > 0 ) {
// Assume the rightmost quote-period match is the end of quotation.
$pos = strrpos( $sentence, "$flag." );
} else {
// When all else fails, make the rightmost candidate a closing quote.
// This is most likely to be problematic in the context of bug #18549.
$pos = strrpos( $sentence, $flag );
}
$sentence = substr_replace( $sentence, $close_quote, $pos, strlen( $flag ) );
}
// Use conventional replacement on any remaining primes and quotes.
$sentence = preg_replace( $prime_pattern, $prime, $sentence );
$sentence = preg_replace( $flag_after_digit, $prime, $sentence );
$sentence = str_replace( $flag, $close_quote, $sentence );
} elseif ( 1 == $count ) {
// Found only one closing quote candidate, so give it priority over primes.
$sentence = str_replace( $flag, $close_quote, $sentence );
$sentence = preg_replace( $prime_pattern, $prime, $sentence );
} else {
// No closing quotes found. Just run primes pattern.
$sentence = preg_replace( $prime_pattern, $prime, $sentence );
}
} else {
$sentence = preg_replace( $prime_pattern, $prime, $sentence );
$sentence = preg_replace( $quote_pattern, $close_quote, $sentence );
}
if ( '"' == $needle && false !== strpos( $sentence, '"' ) ) {
$sentence = str_replace( '"', $close_quote, $sentence );
}
}
return implode( $open_quote, $sentences );
}
/**
* Search for disabled element tags. Push element to stack on tag open and pop
* on tag close.

View File

@ -90,8 +90,8 @@ class Tests_Formatting_WPTexturize extends WP_UnitTestCase {
//$this->assertEquals('Here is &#8220;<a href="http://example.com">a test with a link</a>&#8221;&#8230; and ellipses.', wptexturize('Here is "<a href="http://example.com">a test with a link</a>"... and ellipses.'));
//$this->assertEquals('Here is &#8220;a test <a href="http://example.com">with a link</a>&#8221;.', wptexturize('Here is "a test <a href="http://example.com">with a link</a>".'));
//$this->assertEquals('Here is &#8220;<a href="http://example.com">a test with a link</a>&#8221;and a work stuck to the end.', wptexturize('Here is "<a href="http://example.com">a test with a link</a>"and a work stuck to the end.'));
//$this->assertEquals('A test with a finishing number, &#8220;like 23&#8221;.', wptexturize('A test with a finishing number, "like 23".'));
//$this->assertEquals('A test with a number, &#8220;like 62&#8221;, is nice to have.', wptexturize('A test with a number, "like 62", is nice to have.'));
$this->assertEquals('A test with a finishing number, &#8220;like 23&#8221;.', wptexturize('A test with a finishing number, "like 23".'));
$this->assertEquals('A test with a number, &#8220;like 62&#8221;, is nice to have.', wptexturize('A test with a number, "like 62", is nice to have.'));
}
/**
@ -121,7 +121,7 @@ class Tests_Formatting_WPTexturize extends WP_UnitTestCase {
$this->assertEquals('&#8216;Class of &#8217;99&#8217;?', wptexturize("'Class of '99'?"));
$this->assertEquals('&#8216;Class of &#8217;99&#8217;s&#8217;', wptexturize("'Class of '99's'"));
$this->assertEquals('&#8216;Class of &#8217;99&#8217;s&#8217;', wptexturize("'Class of '99&#8217;s'"));
//$this->assertEquals('&#8220;Class of 99&#8221;', wptexturize("\"Class of 99\""));
$this->assertEquals('&#8220;Class of 99&#8221;', wptexturize("\"Class of 99\""));
$this->assertEquals('&#8220;Class of &#8217;99&#8221;', wptexturize("\"Class of '99\""));
$this->assertEquals('{&#8220;Class of &#8217;99&#8221;}', wptexturize("{\"Class of '99\"}"));
$this->assertEquals(' &#8220;Class of &#8217;99&#8221; ', wptexturize(" \"Class of '99\" "));
@ -1900,4 +1900,152 @@ class Tests_Formatting_WPTexturize extends WP_UnitTestCase {
),
);
}
/**
* Ensure primes logic is not too greedy at the end of a quotation.
*
* @ticket 29256
* @dataProvider data_primes_vs_quotes
*/
function test_primes_vs_quotes( $input, $output ) {
return $this->assertEquals( $output, wptexturize( $input ) );
}
function data_primes_vs_quotes() {
return array(
array(
"George's porch is 99' long.",
"George&#8217;s porch is 99&#8242; long.",
),
array(
'The best year "was that time in 2012" when everyone partied, he said.',
'The best year &#8220;was that time in 2012&#8221; when everyone partied, he said.',
),
array(
"I need 4 x 20' = 80' of trim.", // Works only with a space before the = char.
"I need 4 x 20&#8242; = 80&#8242; of trim.",
),
array(
'"Lorem ipsum dolor sit amet 1234"',
'&#8220;Lorem ipsum dolor sit amet 1234&#8221;',
),
array(
"'Etiam eu egestas dui 1234'",
"&#8216;Etiam eu egestas dui 1234&#8217;",
),
array(
'according to our source, "33% of all students scored less than 50" on the test.',
'according to our source, &#8220;33% of all students scored less than 50&#8221; on the test.',
),
array(
"The doctor said, 'An average height is between 5' and 6' in study group 7'. He then produced a 6' chart of averages. A man of 7', incredibly, is very possible.",
"The doctor said, &#8216;An average height is between 5&#8242; and 6&#8242; in study group 7&#8217;. He then produced a 6&#8242; chart of averages. A man of 7&#8242;, incredibly, is very possible.",
),
array(
'Pirates have voted on "The Expendables 3" with their clicks -- and it turns out the Sylvester Stallone-starrer hasn\'t been astoundingly popular among digital thieves, relatively speaking.
As of Sunday, 5.12 million people worldwide had pirated "Expendables 3" since a high-quality copy hit torrent-sharing sites July 23, according to piracy-tracking firm Excipio.
That likely contributed to the action movie\'s dismal box-office debut this weekend. But over the same July 23-Aug. 18 time period, the movie was No. 4 in downloads, after "Captain America: The Winter Soldier" (7.31 million), "Divergent" (6.29 million) and "The Amazing Spider-Man 2" (5.88 million). Moreover, that\'s despite "Expendables 3" becoming available more than three weeks prior to the film\'s U.S. theatrical debut.
String with a number followed by a single quote \'Expendables 3\' vestibulum in arcu mi.',
'Pirates have voted on &#8220;The Expendables 3&#8221; with their clicks &#8212; and it turns out the Sylvester Stallone-starrer hasn&#8217;t been astoundingly popular among digital thieves, relatively speaking.
As of Sunday, 5.12 million people worldwide had pirated &#8220;Expendables 3&#8221; since a high-quality copy hit torrent-sharing sites July 23, according to piracy-tracking firm Excipio.
That likely contributed to the action movie&#8217;s dismal box-office debut this weekend. But over the same July 23-Aug. 18 time period, the movie was No. 4 in downloads, after &#8220;Captain America: The Winter Soldier&#8221; (7.31 million), &#8220;Divergent&#8221; (6.29 million) and &#8220;The Amazing Spider-Man 2&#8221; (5.88 million). Moreover, that&#8217;s despite &#8220;Expendables 3&#8221; becoming available more than three weeks prior to the film&#8217;s U.S. theatrical debut.
String with a number followed by a single quote &#8216;Expendables 3&#8217; vestibulum in arcu mi.',
),
);
}
/**
* Make sure translation actually works.
*
* Also make sure opening and closing quotes are allowed to be identical.
*
* @ticket 29256
* @dataProvider data_primes_quotes_translation
*/
function test_primes_quotes_translation( $input, $output ) {
add_filter( 'gettext_with_context', array( $this, 'filter_translate2' ), 10, 4 );
$result = wptexturize( $input, true );
remove_filter( 'gettext_with_context', array( $this, 'filter_translate2' ), 10, 4 );
wptexturize( 'reset', true );
return $this->assertEquals( $output, $result );
}
function filter_translate2( $translations, $text, $context, $domain ) {
switch ($text) {
case '&#8211;' : return '!endash!';
case '&#8212;' : return '!emdash!';
case '&#8216;' : return '!q1!';
case '&#8217;' :
if ( 'apostrophe' == $context ) {
return '!apos!';
} else {
return '!q1!';
}
case '&#8220;' : return '!q2!';
case '&#8221;' : return '!q2!';
case '&#8242;' : return '!prime1!';
case '&#8243;' : return '!prime2!';
default : return $translations;
}
}
function data_primes_quotes_translation() {
return array(
array(
"George's porch is 99' long.",
"George!apos!s porch is 99!prime1! long.",
),
array(
'The best year "was that time in 2012" when everyone partied, he said.',
'The best year !q2!was that time in 2012!q2! when everyone partied, he said.',
),
array(
"I need 4 x 20' = 80' of trim.", // Works only with a space before the = char.
"I need 4 x 20!prime1! = 80!prime1! of trim.",
),
array(
'"Lorem ipsum dolor sit amet 1234"',
'!q2!Lorem ipsum dolor sit amet 1234!q2!',
),
array(
"'Etiam eu egestas dui 1234'",
"!q1!Etiam eu egestas dui 1234!q1!",
),
array(
'according to our source, "33% of all students scored less than 50" on the test.',
'according to our source, !q2!33% of all students scored less than 50!q2! on the test.',
),
array(
"The doctor said, 'An average height is between 5' and 6' in study group 7'. He then produced a 6' chart of averages. A man of 7', incredibly, is very possible.",
"The doctor said, !q1!An average height is between 5!prime1! and 6!prime1! in study group 7!q1!. He then produced a 6!prime1! chart of averages. A man of 7!prime1!, incredibly, is very possible.",
),
array(
'Pirates have voted on "The Expendables 3" with their clicks -- and it turns out the Sylvester Stallone-starrer hasn\'t been astoundingly popular among digital thieves, relatively speaking.
As of Sunday, 5.12 million people worldwide had pirated "Expendables 3" since a high-quality copy hit torrent-sharing sites July 23, according to piracy-tracking firm Excipio.
That likely contributed to the action movie\'s dismal box-office debut this weekend. But over the same July 23-Aug. 18 time period, the movie was No. 4 in downloads, after "Captain America: The Winter Soldier" (7.31 million), "Divergent" (6.29 million) and "The Amazing Spider-Man 2" (5.88 million). Moreover, that\'s despite "Expendables 3" becoming available more than three weeks prior to the film\'s U.S. theatrical debut.
String with a number followed by a single quote \'Expendables 3\' vestibulum in arcu mi.',
'Pirates have voted on !q2!The Expendables 3!q2! with their clicks !emdash! and it turns out the Sylvester Stallone-starrer hasn!apos!t been astoundingly popular among digital thieves, relatively speaking.
As of Sunday, 5.12 million people worldwide had pirated !q2!Expendables 3!q2! since a high-quality copy hit torrent-sharing sites July 23, according to piracy-tracking firm Excipio.
That likely contributed to the action movie!apos!s dismal box-office debut this weekend. But over the same July 23-Aug. 18 time period, the movie was No. 4 in downloads, after !q2!Captain America: The Winter Soldier!q2! (7.31 million), !q2!Divergent!q2! (6.29 million) and !q2!The Amazing Spider-Man 2!q2! (5.88 million). Moreover, that!apos!s despite !q2!Expendables 3!q2! becoming available more than three weeks prior to the film!apos!s U.S. theatrical debut.
String with a number followed by a single quote !q1!Expendables 3!q1! vestibulum in arcu mi.',
),
);
}
}