Use RegEx instead of DOMDocument when protecting <pre> tags in WP_oEmbed::_strip_newlines(). It is incredibly difficult to maintain character encoding and whitespace when parsing via DOMDocument.

See #31214.


git-svn-id: https://develop.svn.wordpress.org/trunk@31423 602fd350-edb4-49c9-b593-d223f7449a82
This commit is contained in:
Scott Taylor 2015-02-11 22:17:27 +00:00
parent d00ef0f076
commit a93805f6bb

View File

@@ -563,28 +563,27 @@ class WP_oEmbed {
return $html; return $html;
} }
$pre = array(); $count = 1;
$tokens = array(); $found = array();
if ( class_exists( 'DOMDocument' ) ) { $token = '__PRE__';
$token = '__PRE__'; $search = array( "\t", "\n", "\r", ' ' );
$replace = array(); $replace = array( '__TAB__', '__NL__', '__CR__', '__SPACE__' );
$count = 1; $tokenized = str_replace( $search, $replace, $html );
$dom = new DOMDocument(); preg_match_all( '#(<pre[^>]*>.+?</pre>)#i', $tokenized, $matches, PREG_SET_ORDER );
$dom->loadHTML( $html ); foreach ( $matches as $i => $match ) {
$tags = $dom->getElementsByTagName( 'pre' ); $tag_html = str_replace( $replace, $search, $match[0] );
foreach ( $tags as $i => $tag ) { $tag_token = $token . $i;
$tag_html = $dom->saveHTML( $tag );
$tag_token = $token . $i;
$replace[ $tag_token ] = $tag_html;
$html = str_replace( $tag_html, $tag_token, $html, $count ); $found[ $tag_token ] = $tag_html;
} $html = str_replace( $tag_html, $tag_token, $html, $count );
$pre = array_values( $replace );
$tokens = array_keys( $replace );
} }
$stripped = str_replace( array( "\r\n", "\n" ), '', $html ); $replaced = str_replace( $replace, $search, $html );
$stripped = str_replace( array( "\r\n", "\n" ), '', $replaced );
$pre = array_values( $found );
$tokens = array_keys( $found );
return str_replace( $tokens, $pre, $stripped ); return str_replace( $tokens, $pre, $stripped );
} }
} }