Emoji: Port the Twemoji regex to PHP.

Previously, `wp_encode_emoji()` and `wp_staticize_emoji()` used inaccurate regular expressions to find emoji, and transform them into HTML entities or `<img>`s, respectively. This would result in emoji not being correctly transformed, or occasionally, non-emoji being incorrectly transformed.

This commit adds a new `grunt` task - `grunt precommit:emoji`. It finds the regex in `twemoji.js`, transforms it into a PHP-friendly version, and adds it to `formatting.php`. This task is also automatically run by `grunt precommit`, when it detects that `twemoji.js` has changed.

The new regex requires features introduced in PCRE 8.32, which was introduced in PHP 5.4.14, though it was also backported to later releases of the PHP 5.3 series. For versions of PHP that don't support this, it will fall back to an updated version of the loose-matching regex.

For short posts, the performance difference between the old and new regex is negligible. As the posts get longer, however, the new method is exponentially faster.

Fixes #35293.



git-svn-id: https://develop.svn.wordpress.org/trunk@41043 602fd350-edb4-49c9-b593-d223f7449a82
This commit is contained in:
Gary Pendergast 2017-07-14 05:46:19 +00:00
parent 666b9d7ccd
commit f780ce0a49
5 changed files with 278 additions and 65 deletions

View File

@ -607,6 +607,97 @@ module.exports = function(grunt) {
dest: '.'
}
},
replace: {
emojiRegex: {
options: {
patterns: [
{
match: /\/\/ START: emoji regex[\S\s]*\/\/ END: emoji regex/g,
replacement: function () {
var twemoji = grunt.file.read( SOURCE_DIR + 'wp-includes/js/twemoji.js' ),
found = twemoji.match( /re = \/(.*)\/g,/ ),
emojiRegex = found[1],
regex = '',
entities = '';
/*
* Twemoji does some nifty regex optimisations, splitting up surrogate pairs unit, searching by
* ranges of individual units, and compressing sets of individual units. This is super useful for
* reducing the size of the regex.
*
* Unfortunately, PCRE doesn't allow regexes to search for individual units, so we can't just
* blindly copy the Twemoji regex.
*
* The good news is, we don't have to worry about size restrictions, so we can just unravel the
* entire regex, and convert it to a PCRE-friendly format.
*/
// Convert ranges: "\udc68-\udc6a" becomes "\udc68\udc69\udc6a".
emojiRegex = emojiRegex.replace( /(\\u\w{4})\-(\\u\w{4})/g, function ( match, first, last ) {
var start = parseInt( first.substr( 2 ), 16 );
var end = parseInt( last.substr( 2 ), 16 );
var replace = '';
for( var counter = start; counter <= end; counter++ ) {
replace += '\\u' + counter.toString( 16 );
}
return replace;
} );
// Convert sets: "\u200d[\u2640\u2642]\ufe0f" becomes "\u200d\u2640\ufe0f|\u200d\u2642\ufe0f".
emojiRegex = emojiRegex.replace( /((?:\\u\w{4})*)\[((?:\\u\w{4})+)\]((?:\\u\w{4})*)/g, function ( match, before, middle, after ) {
//return params[1].split( '\\u' ).join( '|' + params[0] + '\\u' ).substr( 1 );
if ( ! before && ! after ) {
return match;
}
var set = middle.match( /.{1,6}/g );
return before + set.join( after + '|' + before ) + after;
} );
// Convert surrogate pairs to their equivalent unicode scalar: "\ud83d\udc68" becomes "\u1f468".
emojiRegex = emojiRegex.replace( /(\\ud[89a-f][0-9a-f]{2})(\\ud[89a-f][0-9a-f]{2})/g, function ( match, first, second ) {
var high = parseInt( first.substr( 2 ), 16 );
var low = parseInt( second.substr( 2 ), 16 );
var scalar = ( ( high - 0xD800 ) * 0x400 ) + ( low - 0xDC00 ) + 0x10000;
return '\\u' + scalar.toString( 16 );
} );
// Convert JavaScript-style code points to PHP-style: "\u1f468" becomes "\x{1f468}".
emojiRegex = emojiRegex.replace( /\\u(\w+)/g, '\\x{$1}' );
// Convert PHP-style code points to HTML entities: "\x{1f468}" becomes "&#x1f468;".
entities = emojiRegex.replace( /\\x{(\w+)}/g, '&#x$1;' );
entities = entities.replace( /\[([^\]]+)\]/g, function( match, codepoint ) {
return '(?:' + codepoint.replace( /;&/g, ';|&' ) + ')';
} );
regex += '// START: emoji regex\n';
regex += '\t$codepoints = \'/(' + emojiRegex + ')/u\';\n';
regex += '\t$entities = \'/(' + entities + ')/u\';\n';
regex += '\t// END: emoji regex';
return regex;
}
}
]
},
files: [
{
expand: true,
flatten: true,
src: [
SOURCE_DIR + 'wp-includes/formatting.php'
],
dest: SOURCE_DIR + 'wp-includes/'
}
]
}
},
_watch: {
all: {
files: [
@ -718,6 +809,10 @@ module.exports = function(grunt) {
'phpunit'
] );
// Alias task: `grunt precommit:emoji` runs the `replace:emojiRegex` target.
grunt.registerTask( 'precommit:emoji', [ 'replace:emojiRegex' ] );
grunt.registerTask( 'precommit', 'Runs test and build tasks in preparation for a commit', function() {
var done = this.async();
var map = {
@ -783,6 +878,11 @@ module.exports = function(grunt) {
taskList.push( 'precommit:' + extension );
}
} );
// Queue the emoji task when testPath reports a hit for twemoji.js.
// NOTE(review): testPath is defined outside this view; presumably it checks the list of changed files - confirm.
if ( [ 'twemoji.js' ].some( testPath ) ) {
	// The task name is wrapped in a matching pair of backticks; the closing one was missing.
	grunt.log.writeln( 'twemoji.js has updated. Running `precommit:emoji`.' );
	taskList.push( 'precommit:emoji' );
}
}
grunt.task.run( taskList );

View File

@ -30,6 +30,7 @@
"grunt-legacy-util": "^0.2.0",
"grunt-patch-wordpress": "~0.4.2",
"grunt-postcss": "~0.7.1",
"grunt-replace": "~1.0.1",
"grunt-rtlcss": "~2.0.1",
"grunt-sass": "~1.2.1",
"matchdep": "~1.0.0"

File diff suppressed because one or more lines are too long

View File

@ -2,19 +2,21 @@
/**
* @group formatting
* @group emoji
*/
class Tests_Formatting_Emoji extends WP_UnitTestCase {
private $png_cdn = 'https://s.w.org/images/core/emoji/2.3/72x72/';
private $svn_cdn = 'https://s.w.org/images/core/emoji/2.3/svg/';
/**
* @ticket 36525
*/
public function test_unfiltered_emoji_cdns() {
$png_cdn = 'https://s.w.org/images/core/emoji/2.3/72x72/';
$svn_cdn = 'https://s.w.org/images/core/emoji/2.3/svg/';
$output = get_echo( '_print_emoji_detection_script' );
$this->assertContains( wp_json_encode( $png_cdn ), $output );
$this->assertContains( wp_json_encode( $svn_cdn ), $output );
$this->assertContains( wp_json_encode( $this->png_cdn ), $output );
$this->assertContains( wp_json_encode( $this->svn_cdn ), $output );
}
public function _filtered_emoji_svn_cdn( $cdn = '' ) {
@ -25,17 +27,14 @@ class Tests_Formatting_Emoji extends WP_UnitTestCase {
* @ticket 36525
*/
public function test_filtered_emoji_svn_cdn() {
$png_cdn = 'https://s.w.org/images/core/emoji/2.3/72x72/';
$svn_cdn = 'https://s.w.org/images/core/emoji/2.3/svg/';
$filtered_svn_cdn = $this->_filtered_emoji_svn_cdn();
add_filter( 'emoji_svg_url', array( $this, '_filtered_emoji_svn_cdn' ) );
$output = get_echo( '_print_emoji_detection_script' );
$this->assertContains( wp_json_encode( $png_cdn ), $output );
$this->assertNotContains( wp_json_encode( $svn_cdn ), $output );
$this->assertContains( wp_json_encode( $this->png_cdn ), $output );
$this->assertNotContains( wp_json_encode( $this->svn_cdn ), $output );
$this->assertContains( wp_json_encode( $filtered_svn_cdn ), $output );
remove_filter( 'emoji_svg_url', array( $this, '_filtered_emoji_svn_cdn' ) );
@ -49,9 +48,6 @@ class Tests_Formatting_Emoji extends WP_UnitTestCase {
* @ticket 36525
*/
public function test_filtered_emoji_png_cdn() {
$png_cdn = 'https://s.w.org/images/core/emoji/2.3/72x72/';
$svn_cdn = 'https://s.w.org/images/core/emoji/2.3/svg/';
$filtered_png_cdn = $this->_filtered_emoji_png_cdn();
add_filter( 'emoji_url', array( $this, '_filtered_emoji_png_cdn' ) );
@ -59,10 +55,95 @@ class Tests_Formatting_Emoji extends WP_UnitTestCase {
$output = get_echo( '_print_emoji_detection_script' );
$this->assertContains( wp_json_encode( $filtered_png_cdn ), $output );
$this->assertNotContains( wp_json_encode( $png_cdn ), $output );
$this->assertContains( wp_json_encode( $svn_cdn ), $output );
$this->assertNotContains( wp_json_encode( $this->png_cdn ), $output );
$this->assertContains( wp_json_encode( $this->svn_cdn ), $output );
remove_filter( 'emoji_url', array( $this, '_filtered_emoji_png_cdn' ) );
}
/**
 * Checks the values handed back by wp_emoji_regex(): every call yields a
 * non-empty value, the no-argument call is identical to 'codepoints', and
 * 'entities' differs from that default.
 *
 * @ticket 35293
 */
public function test_wp_emoji_regex_returns_regexen() {
	// No argument: whatever the function returns by default.
	$without_type = wp_emoji_regex();
	$this->assertNotEmpty( $without_type );

	// Explicit 'codepoints' must be the very same value as the default.
	$codepoint_regex = wp_emoji_regex( 'codepoints' );
	$this->assertNotEmpty( $codepoint_regex );
	$this->assertSame( $without_type, $codepoint_regex );

	// 'entities' must be populated, and must not be the default value.
	$entity_regex = wp_emoji_regex( 'entities' );
	$this->assertNotEmpty( $entity_regex );
	$this->assertNotSame( $without_type, $entity_regex );
}
/**
 * Data provider for test_wp_encode_emoji().
 *
 * @return array[] List of cases, each an array of ( input string, expected
 *                 return value of wp_encode_emoji() for that input ).
 */
public function data_wp_encode_emoji() {
return array(
array(
// Not emoji
'',
'',
),
array(
// Simple emoji
'🙂',
'&#x1f642;',
),
array(
// Skin tone, gender, ZWJ, emoji selector
'👮🏼‍♀️',
'&#x1f46e;&#x1f3fc;&#x200d;&#x2640;&#xfe0f;',
),
array(
// Unicode 10
'🧚',
'&#x1f9da;',
),
);
}
/**
 * Feeds each case from data_wp_encode_emoji() through wp_encode_emoji() and
 * requires an exact match with the expected string.
 *
 * @ticket 35293
 * @dataProvider data_wp_encode_emoji
 */
public function test_wp_encode_emoji( $emoji, $expected ) {
	$encoded = wp_encode_emoji( $emoji );

	$this->assertSame( $expected, $encoded );
}
/**
 * Data provider for test_wp_staticize_emoji().
 *
 * The expected <img> markup is built from the $png_cdn property of this
 * test class.
 *
 * @return array[] List of cases, each an array of ( input string, expected
 *                 return value of wp_staticize_emoji() for that input ).
 */
public function data_wp_staticize_emoji() {
return array(
array(
// Not emoji
'',
'',
),
array(
// Simple emoji
'🙂',
'<img src="' . $this->png_cdn . '1f642.png" alt="🙂" class="wp-smiley" style="height: 1em; max-height: 1em;" />',
),
array(
// Skin tone, gender, ZWJ, emoji selector
'👮🏼‍♀️',
'<img src="' . $this->png_cdn . '1f46e-1f3fc-200d-2640-fe0f.png" alt="👮🏼‍♀️" class="wp-smiley" style="height: 1em; max-height: 1em;" />',
),
array(
// Unicode 10
'🧚',
'<img src="' . $this->png_cdn . '1f9da.png" alt="🧚" class="wp-smiley" style="height: 1em; max-height: 1em;" />',
),
);
}
/**
 * Feeds each case from data_wp_staticize_emoji() through
 * wp_staticize_emoji() and requires an exact match with the expected markup.
 *
 * @ticket 35293
 * @dataProvider data_wp_staticize_emoji
 */
public function test_wp_staticize_emoji( $emoji, $expected ) {
	$staticized = wp_staticize_emoji( $emoji );

	$this->assertSame( $expected, $staticized );
}
}

View File

@ -2,6 +2,7 @@
/**
* @group formatting
* @group emoji
*/
class Tests_Formatting_Smilies extends WP_UnitTestCase {