From e2951f5744363ccc350576f4636eab882b18465a Mon Sep 17 00:00:00 2001 From: Alain Schlesser Date: Fri, 15 Nov 2019 16:10:35 +0100 Subject: [PATCH 01/67] Add initial test for meta tag sanitization --- tests/php/test-class-amp-meta-sanitizer.php | 74 +++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 tests/php/test-class-amp-meta-sanitizer.php diff --git a/tests/php/test-class-amp-meta-sanitizer.php b/tests/php/test-class-amp-meta-sanitizer.php new file mode 100644 index 00000000000..0d3a5092e6e --- /dev/null +++ b/tests/php/test-class-amp-meta-sanitizer.php @@ -0,0 +1,74 @@ +', + '' + ], + [ + '', + '' + ], + [ + '', + '' + ], + [ + '', + '' + ], + ]; + } + + /** + * Tests the sanitize method. + * + * @dataProvider get_data_for_sanitize + * @covers \AMP_Meta_Sanitizer::sanitize() + * + * @param string $source_content Source DOM content. + * @param string $expected_content Expected content after sanitization. + */ + public function test_sanitize( $source_content, $expected_content ) { + $dom = AMP_DOM_Utils::get_dom( $source_content ); + $sanitizer = new AMP_Meta_Sanitizer( $dom ); + $sanitizer->sanitize(); + + $this->assertEqualMarkup( $expected_content, $dom->saveHTML() ); + } + + /** + * Assert markup is equal. + * + * @param string $expected Expected markup. + * @param string $actual Actual markup. + */ + public function assertEqualMarkup( $expected, $actual ) { + $actual = preg_replace( '/\s+/', ' ', $actual ); + $expected = preg_replace( '/\s+/', ' ', $expected ); + $actual = preg_replace( '/(?<=>)\s+(?=<)/', '', trim( $actual ) ); + $expected = preg_replace( '/(?<=>)\s+(?=<)/', '', trim( $expected ) ); + + $this->assertEquals( + array_filter( preg_split( '#(<[^>]+>|[^<>]+)#', $expected, -1, PREG_SPLIT_DELIM_CAPTURE ) ), + array_filter( preg_split( '#(<[^>]+>|[^<>]+)#', $actual, -1, PREG_SPLIT_DELIM_CAPTURE ) ) + ); + } +} From 395169dcced7af7ab6148e358c685019b1f80b93 Mon Sep 17 00:00:00 2001 From: Alain Schlesser Date: Fri, 15 Nov 2019 16:11:14 +0100 Subject: [PATCH 02/67] Add initial implementation of meta tag sanitizer --- includes/amp-helper-functions.php | 1 + includes/class-amp-autoloader.php | 1 + .../sanitizers/class-amp-meta-sanitizer.php | 116 ++++++++++++++++++ 3 files changed, 118 insertions(+) create mode 100644 includes/sanitizers/class-amp-meta-sanitizer.php diff --git a/includes/amp-helper-functions.php b/includes/amp-helper-functions.php index 67cc7878c09..6c42c60f91f 100644 --- a/includes/amp-helper-functions.php +++ b/includes/amp-helper-functions.php @@ -915,6 +915,7 @@ function amp_get_content_sanitizers( $post = null ) { 'AMP_Style_Sanitizer' => [ 'include_manifest_comment' => ( defined( 'WP_DEBUG' ) && WP_DEBUG ) ? 'always' : 'when_excessive', ], + 'AMP_Meta_Sanitizer' => [], 'AMP_Tag_And_Attribute_Sanitizer' => [], // Note: This whitelist sanitizer must come at the end to clean up any remaining issues the other sanitizers didn't catch. ]; diff --git a/includes/class-amp-autoloader.php b/includes/class-amp-autoloader.php index 80b0fe4e765..c6241e87671 100644 --- a/includes/class-amp-autoloader.php +++ b/includes/class-amp-autoloader.php @@ -78,6 +78,7 @@ class AMP_Autoloader { 'AMP_Iframe_Sanitizer' => 'includes/sanitizers/class-amp-iframe-sanitizer', 'AMP_Img_Sanitizer' => 'includes/sanitizers/class-amp-img-sanitizer', 'AMP_Link_Sanitizer' => 'includes/sanitizers/class-amp-link-sanitizer', + 'AMP_Meta_Sanitizer' => 'includes/sanitizers/class-amp-meta-sanitizer', 'AMP_Nav_Menu_Toggle_Sanitizer' => 'includes/sanitizers/class-amp-nav-menu-toggle-sanitizer', 'AMP_Nav_Menu_Dropdown_Sanitizer' => 'includes/sanitizers/class-amp-nav-menu-dropdown-sanitizer', 'AMP_Comments_Sanitizer' => 'includes/sanitizers/class-amp-comments-sanitizer', diff --git a/includes/sanitizers/class-amp-meta-sanitizer.php b/includes/sanitizers/class-amp-meta-sanitizer.php new file mode 100644 index 00000000000..1eccbf7fc3a --- /dev/null +++ b/includes/sanitizers/class-amp-meta-sanitizer.php @@ -0,0 +1,116 @@ + tag to identify and replace with AMP version. + */ + public static $tag = 'meta'; + + /** + * Placeholder for default arguments, to be set in child classes. + * + * @var array + */ + protected $DEFAULT_ARGS = [ // phpcs:ignore WordPress.NamingConventions.ValidVariableName.PropertyNotSnakeCase + 'use_document_element' => true, // We want to work on the header, so we need the entire document. + ]; + + CONST AMP_CHARSET = 'utf-8'; + + /** + * Sanitize. + */ + public function sanitize() { + foreach ( $this->dom->getElementsByTagName( static::$tag ) as $element ) { + $this->sanitize_element( $element ); + } + + $charset = $this->ensure_charset_is_present(); + + if ( static::AMP_CHARSET !== $charset ) { + // @TODO Re-encode the content into UTF-8. + // ... sure? + } + } + + /** + * Sanitize an individual meta tag. + * + * @param DOMElement $element Meta tag to sanitize. + */ + protected function sanitize_element( DOMElement $element ) { + // Handle HTML 4 http-equiv meta tags. + if ( 'content-type' === strtolower( $element->getAttribute( 'http-equiv' ) ) ) { + $charset = $element->getAttribute( 'charset' ); + if ( $charset ) { + // If we have a charset attribute included, use that as a separate tag. + $element->parentNode->appendChild( $this->create_charset_node( $charset ) ); + } else { + // If not, check whether the charset is included with the content type, and use that. + $content = $element->getAttribute( 'content' ); + $matches = []; + if ( preg_match( '/;\s*charset=(?[^;]+)/', $content, $matches ) && ! empty( $matches['charset'] ) ) { + $element->parentNode->appendChild( $this->create_charset_node( $matches['charset'] ) ); + } + } + // In case we haven't found a charset by now, a default utf-8 one will be added in a later step. + + // Always remove the HTML 4 http-equiv tag. + $element->parentNode->removeChild( $element ); + } + } + + /** + * Always ensure that we have an HTML 5 charset meta tag. + * + * The charset defaults to utf-8, which is also what AMP requires. + * + * @return string The charset that was detected or added. + */ + protected function ensure_charset_is_present() { + $xpath = New DOMXPath( $this->dom ); + + // Bail early if we already have a meta charset. + $charset_element = $xpath->query( '//meta[ @charset ]' )->item( 0 ); + if ( $charset_element ) { + return $charset_element->getAttribute( 'charset' ); + } + + $parent = $xpath->query( '//html/head' )->item( 0 ); + if ( ! $parent ) { + // We did not detect the actual head node to attach the meta tag to, so we just + // add it to the document and assume the other sanitizers will figure it out. + $parent = $this->dom->childNodes->item( 0 ); + } + + // No charset found, so add the default one. + $parent->appendChild( $this->create_charset_node( static::AMP_CHARSET ) ); + } + + /** + * Create a new meta tag for the charset value. + * + * @param string $charset Character set to use. + * @return DOMElement New meta tag with requested charset. + */ + protected function create_charset_node( $charset ) { + $charset_node = $this->dom->createElement( 'meta' ); + $charset_node->setAttribute( 'charset', strtolower( $charset ) ); + return $charset_node; + } +} From a569500c151a2d1ffa3a74b3b4220074dbff75df Mon Sep 17 00:00:00 2001 From: Alain Schlesser Date: Fri, 15 Nov 2019 16:15:00 +0100 Subject: [PATCH 03/67] Satisfy PHPCS --- includes/sanitizers/class-amp-meta-sanitizer.php | 11 ++++++++--- tests/php/test-class-amp-meta-sanitizer.php | 8 ++++---- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/includes/sanitizers/class-amp-meta-sanitizer.php b/includes/sanitizers/class-amp-meta-sanitizer.php index 1eccbf7fc3a..ced82cffb31 100644 --- a/includes/sanitizers/class-amp-meta-sanitizer.php +++ b/includes/sanitizers/class-amp-meta-sanitizer.php @@ -30,7 +30,12 @@ class AMP_Meta_Sanitizer extends AMP_Base_Sanitizer { 'use_document_element' => true, // We want to work on the header, so we need the entire document. ]; - CONST AMP_CHARSET = 'utf-8'; + /** + * Charset to use for AMP markup. + * + * @var string + */ + const AMP_CHARSET = 'utf-8'; /** * Sanitize. @@ -42,7 +47,7 @@ public function sanitize() { $charset = $this->ensure_charset_is_present(); - if ( static::AMP_CHARSET !== $charset ) { + if ( static::AMP_CHARSET !== $charset ) { // phpcs:ignore Generic.CodeAnalysis.EmptyStatement // @TODO Re-encode the content into UTF-8. // ... sure? } @@ -83,7 +88,7 @@ protected function sanitize_element( DOMElement $element ) { * @return string The charset that was detected or added. */ protected function ensure_charset_is_present() { - $xpath = New DOMXPath( $this->dom ); + $xpath = new DOMXPath( $this->dom ); // Bail early if we already have a meta charset. $charset_element = $xpath->query( '//meta[ @charset ]' )->item( 0 ); diff --git a/tests/php/test-class-amp-meta-sanitizer.php b/tests/php/test-class-amp-meta-sanitizer.php index 0d3a5092e6e..0348a6e7637 100644 --- a/tests/php/test-class-amp-meta-sanitizer.php +++ b/tests/php/test-class-amp-meta-sanitizer.php @@ -20,19 +20,19 @@ public function get_data_for_sanitize() { // Turn HTML 4 charset into HTML 5 charset. [ '', - '' + '', ], [ '', - '' + '', ], [ '', - '' + '', ], [ '', - '' + '', ], ]; } From 043187a8374c870eeee0f647c1e1d4d3fb9c7260 Mon Sep 17 00:00:00 2001 From: Alain Schlesser Date: Fri, 15 Nov 2019 16:24:32 +0100 Subject: [PATCH 04/67] Accept uppercase UTF-8 charset --- includes/sanitizers/class-amp-meta-sanitizer.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/includes/sanitizers/class-amp-meta-sanitizer.php b/includes/sanitizers/class-amp-meta-sanitizer.php index ced82cffb31..537175e5696 100644 --- a/includes/sanitizers/class-amp-meta-sanitizer.php +++ b/includes/sanitizers/class-amp-meta-sanitizer.php @@ -47,7 +47,7 @@ public function sanitize() { $charset = $this->ensure_charset_is_present(); - if ( static::AMP_CHARSET !== $charset ) { // phpcs:ignore Generic.CodeAnalysis.EmptyStatement + if ( static::AMP_CHARSET !== strtolower( $charset ) ) { // phpcs:ignore Generic.CodeAnalysis.EmptyStatement // @TODO Re-encode the content into UTF-8. // ... sure? } From 847c75320a6c2b43cd3befce3c31e40d2870b3f3 Mon Sep 17 00:00:00 2001 From: Alain Schlesser Date: Fri, 15 Nov 2019 16:33:11 +0100 Subject: [PATCH 05/67] Improve tests --- tests/php/test-class-amp-meta-sanitizer.php | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/php/test-class-amp-meta-sanitizer.php b/tests/php/test-class-amp-meta-sanitizer.php index 0348a6e7637..cf2c5e45727 100644 --- a/tests/php/test-class-amp-meta-sanitizer.php +++ b/tests/php/test-class-amp-meta-sanitizer.php @@ -17,11 +17,17 @@ class Test_AMP_Meta_Sanitizer extends WP_UnitTestCase { */ public function get_data_for_sanitize() { return [ - // Turn HTML 4 charset into HTML 5 charset. + // Don't break the correct charset tag. [ '', '', ], + // Add default charset tag if none is present. + [ + '', + '', + ], + // Turn HTML 4 charset tag into HTML 5 charset tag. [ '', '', From 834f26cbcbdc64c6f8497dba01b7b61ccf369584 Mon Sep 17 00:00:00 2001 From: Alain Schlesser Date: Fri, 15 Nov 2019 17:51:32 +0100 Subject: [PATCH 06/67] Extract more code out of AMP_Theme_Support --- includes/class-amp-theme-support.php | 40 +----- .../sanitizers/class-amp-meta-sanitizer.php | 134 +++++++++++++++--- tests/php/test-class-amp-meta-sanitizer.php | 46 ++++-- 3 files changed, 148 insertions(+), 72 deletions(-) diff --git a/includes/class-amp-theme-support.php b/includes/class-amp-theme-support.php index 0df3b892bd4..0d35ae4ee30 100644 --- a/includes/class-amp-theme-support.php +++ b/includes/class-amp-theme-support.php @@ -1616,12 +1616,7 @@ public static function ensure_required_markup( DOMDocument $dom, $script_handles $xpath = new DOMXPath( $dom ); - // Make sure the HEAD element is in the doc. $head = $dom->getElementsByTagName( 'head' )->item( 0 ); - if ( ! $head ) { - $head = $dom->createElement( 'head' ); - $dom->documentElement->insertBefore( $head, $dom->documentElement->firstChild ); - } // Ensure there is a schema.org script in the document. // @todo Consider applying the amp_schemaorg_metadata filter on the contents when a script is already present. @@ -1679,14 +1674,11 @@ public static function ensure_required_markup( DOMDocument $dom, $script_handles * * {@link https://amp.dev/documentation/guides-and-tutorials/optimize-and-measure/optimize_amp/ Optimize the AMP Runtime loading} */ - $meta_charset = null; $meta_viewport = null; $meta_amp_script_srcs = []; $meta_elements = []; foreach ( $head->getElementsByTagName( 'meta' ) as $meta ) { - if ( $meta->hasAttribute( 'charset' ) ) { // There will not be a meta[http-equiv] because the sanitizer removed it. - $meta_charset = $meta; - } elseif ( 'viewport' === $meta->getAttribute( 'name' ) ) { + if ( 'viewport' === $meta->getAttribute( 'name' ) ) { $meta_viewport = $meta; } elseif ( 'amp-script-src' === $meta->getAttribute( 'name' ) ) { $meta_amp_script_srcs[] = $meta; @@ -1695,36 +1687,6 @@ public static function ensure_required_markup( DOMDocument $dom, $script_handles } } - // Handle meta charset. - if ( ! $meta_charset ) { - // Warning: This probably means the character encoding needs to be converted. - $meta_charset = AMP_DOM_Utils::create_node( - $dom, - 'meta', - [ - 'charset' => 'utf-8', - ] - ); - } else { - $head->removeChild( $meta_charset ); // So we can move it. - } - $head->insertBefore( $meta_charset, $head->firstChild ); - - // Handle meta viewport. - if ( ! $meta_viewport ) { - $meta_viewport = AMP_DOM_Utils::create_node( - $dom, - 'meta', - [ - 'name' => 'viewport', - 'content' => 'width=device-width', - ] - ); - } else { - $head->removeChild( $meta_viewport ); // So we can move it. - } - $head->insertBefore( $meta_viewport, $meta_charset->nextSibling ); - // Handle meta amp-script-src elements. $first_meta_amp_script_src = array_shift( $meta_amp_script_srcs ); if ( $first_meta_amp_script_src ) { diff --git a/includes/sanitizers/class-amp-meta-sanitizer.php b/includes/sanitizers/class-amp-meta-sanitizer.php index 537175e5696..b8d463a9afe 100644 --- a/includes/sanitizers/class-amp-meta-sanitizer.php +++ b/includes/sanitizers/class-amp-meta-sanitizer.php @@ -30,6 +30,20 @@ class AMP_Meta_Sanitizer extends AMP_Base_Sanitizer { 'use_document_element' => true, // We want to work on the header, so we need the entire document. ]; + /** + * Reference to the shared XPath object to query the DOM. + * + * @var DOMXPath + */ + protected $xpath; + + /** + * The document's element. + * + * @var DOMElement + */ + protected $head; + /** * Charset to use for AMP markup. * @@ -37,20 +51,48 @@ class AMP_Meta_Sanitizer extends AMP_Base_Sanitizer { */ const AMP_CHARSET = 'utf-8'; + /** + * Viewport settings to use for AMP markup. + * + * @var string + */ + const AMP_VIEWPORT = 'width=device-width'; + /** * Sanitize. */ public function sanitize() { + $this->xpath = new DOMXPath( $this->dom ); + $this->head = $this->ensure_head_is_present(); + foreach ( $this->dom->getElementsByTagName( static::$tag ) as $element ) { $this->sanitize_element( $element ); } - $charset = $this->ensure_charset_is_present(); + $charset_element = $this->ensure_charset_is_present_and_first_in_head(); - if ( static::AMP_CHARSET !== strtolower( $charset ) ) { // phpcs:ignore Generic.CodeAnalysis.EmptyStatement + if ( ! $this->is_correct_charset( $charset_element ) ) { // phpcs:ignore Generic.CodeAnalysis.EmptyStatement // @TODO Re-encode the content into UTF-8. // ... sure? } + + $this->ensure_viewport_is_present_and_after_charset( $charset_element ); + } + + /** + * Ensure that the element is present in the document. + * + * @return DOMElement The document's element. + */ + protected function ensure_head_is_present() { + $head = $this->dom->getElementsByTagName( 'head' )->item( 0 ); + + if ( ! $head ) { + $head = $this->dom->createElement( 'head' ); + $head = $this->dom->documentElement->insertBefore( $head, $this->dom->documentElement->firstChild ); + } + + return $head; } /** @@ -64,13 +106,13 @@ protected function sanitize_element( DOMElement $element ) { $charset = $element->getAttribute( 'charset' ); if ( $charset ) { // If we have a charset attribute included, use that as a separate tag. - $element->parentNode->appendChild( $this->create_charset_node( $charset ) ); + $element->parentNode->appendChild( $this->create_charset_element( $charset ) ); } else { // If not, check whether the charset is included with the content type, and use that. $content = $element->getAttribute( 'content' ); $matches = []; if ( preg_match( '/;\s*charset=(?[^;]+)/', $content, $matches ) && ! empty( $matches['charset'] ) ) { - $element->parentNode->appendChild( $this->create_charset_node( $matches['charset'] ) ); + $element->parentNode->appendChild( $this->create_charset_element( $matches['charset'] ) ); } } // In case we haven't found a charset by now, a default utf-8 one will be added in a later step. @@ -81,30 +123,45 @@ protected function sanitize_element( DOMElement $element ) { } /** - * Always ensure that we have an HTML 5 charset meta tag. + * Always ensure that we have an HTML 5 charset meta tag, and force it to be the first in . * * The charset defaults to utf-8, which is also what AMP requires. * - * @return string The charset that was detected or added. + * @return DOMElement The charset element that was detected or added. */ - protected function ensure_charset_is_present() { - $xpath = new DOMXPath( $this->dom ); - - // Bail early if we already have a meta charset. - $charset_element = $xpath->query( '//meta[ @charset ]' )->item( 0 ); + protected function ensure_charset_is_present_and_first_in_head() { + // Retrieve the charset element or create a new one. + $charset_element = $this->xpath->query( '//meta[ @charset ]' )->item( 0 ); if ( $charset_element ) { - return $charset_element->getAttribute( 'charset' ); + $charset_element->parentNode->removeChild( $charset_element ); // So that we can move it. + } else { + $charset_element = $this->create_charset_element( static::AMP_CHARSET ); } - $parent = $xpath->query( '//html/head' )->item( 0 ); - if ( ! $parent ) { - // We did not detect the actual head node to attach the meta tag to, so we just - // add it to the document and assume the other sanitizers will figure it out. - $parent = $this->dom->childNodes->item( 0 ); + // (Re)insert the charset as first element of the head. + $charset_element = $this->head->insertBefore( $charset_element, $this->head->firstChild ); + + return $charset_element; + } + + /** + * Always ensure we have a viewport tag and force it to be the second in (after charset). + * + * The viewport defaults to 'width=device-width', which is the bare minimum that AMP requires. + * + * @param DOMElement $charset_element The charset meta tag element to append the viewport to. + */ + protected function ensure_viewport_is_present_and_after_charset( DOMElement $charset_element ) { + // Retrieve the viewport element or create a new one. + $viewport_element = $this->xpath->query( '//meta[ @name = "viewport" ]' )->item( 0 ); + if ( $viewport_element ) { + $viewport_element->parentNode->removeChild( $viewport_element ); // So that we can move it. + } else { + $viewport_element = $this->create_viewport_element( static::AMP_VIEWPORT ); } - // No charset found, so add the default one. - $parent->appendChild( $this->create_charset_node( static::AMP_CHARSET ) ); + // (Re)insert the viewport as first element of the head. + $this->head->insertBefore( $viewport_element, $charset_element->nextSibling ); } /** @@ -113,9 +170,40 @@ protected function ensure_charset_is_present() { * @param string $charset Character set to use. * @return DOMElement New meta tag with requested charset. */ - protected function create_charset_node( $charset ) { - $charset_node = $this->dom->createElement( 'meta' ); - $charset_node->setAttribute( 'charset', strtolower( $charset ) ); - return $charset_node; + protected function create_charset_element( $charset ) { + return AMP_DOM_Utils::create_node( + $this->dom, + 'meta', + [ + 'charset' => strtolower( $charset ), + ] + ); + } + + /** + * Create a new meta tag for the viewport setting. + * + * @param string $viewport Viewport setting to use. + * @return DOMElement New meta tag with requested viewport setting. + */ + protected function create_viewport_element( $viewport ) { + return AMP_DOM_Utils::create_node( + $this->dom, + 'meta', + [ + 'name' => 'viewport', + 'content' => $viewport, + ] + ); + } + + /** + * Check whether the charset is the correct one according to AMP requirements. + * + * @param DOMElement $charset_element Charset meta tag element. + * @return bool Whether the charset is the correct one. + */ + protected function is_correct_charset( DOMElement $charset_element ) { + return static::AMP_CHARSET === strtolower( $charset_element->getAttribute( 'charset' ) ); } } diff --git a/tests/php/test-class-amp-meta-sanitizer.php b/tests/php/test-class-amp-meta-sanitizer.php index cf2c5e45727..1466918f095 100644 --- a/tests/php/test-class-amp-meta-sanitizer.php +++ b/tests/php/test-class-amp-meta-sanitizer.php @@ -17,28 +17,54 @@ class Test_AMP_Meta_Sanitizer extends WP_UnitTestCase { */ public function get_data_for_sanitize() { return [ - // Don't break the correct charset tag. + // Don't break the correct charset tag (note the caps, which deviates from the default). [ - '', - '', + '', + '', ], + + // Don't break the correct viewport tag (note the scale, which deviates from the default). + [ + '', + '', + ], + + // Move charset and viewport tags from body to head. + [ + '', + '', + ], + // Add default charset tag if none is present. [ - '', + '', + '', + ], + + // Add default viewport tag if none is present. + [ '', + '', + ], + + // Make sure charset is the first meta tag. + [ + '', + '', ], + // Turn HTML 4 charset tag into HTML 5 charset tag. [ - '', - '', + '', + '', ], [ - '', - '', + '', + '', ], [ - '', - '', + '', + '', ], ]; } From 1ceb73922d5041ca4de7940ddf225bfe4146081b Mon Sep 17 00:00:00 2001 From: Alain Schlesser Date: Fri, 15 Nov 2019 18:07:42 +0100 Subject: [PATCH 07/67] Get rid of viewport references in AMP_Theme_Support --- includes/class-amp-theme-support.php | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/includes/class-amp-theme-support.php b/includes/class-amp-theme-support.php index 0d35ae4ee30..1adec99899f 100644 --- a/includes/class-amp-theme-support.php +++ b/includes/class-amp-theme-support.php @@ -1674,15 +1674,12 @@ public static function ensure_required_markup( DOMDocument $dom, $script_handles * * {@link https://amp.dev/documentation/guides-and-tutorials/optimize-and-measure/optimize_amp/ Optimize the AMP Runtime loading} */ - $meta_viewport = null; $meta_amp_script_srcs = []; $meta_elements = []; foreach ( $head->getElementsByTagName( 'meta' ) as $meta ) { - if ( 'viewport' === $meta->getAttribute( 'name' ) ) { - $meta_viewport = $meta; - } elseif ( 'amp-script-src' === $meta->getAttribute( 'name' ) ) { + if ( 'amp-script-src' === $meta->getAttribute( 'name' ) ) { $meta_amp_script_srcs[] = $meta; - } else { + } elseif ( ! $meta->hasAttribute( 'charset' ) && 'viewport' !== $meta->getAttribute( 'name' ) ) { $meta_elements[] = $meta; } } @@ -1706,7 +1703,8 @@ public static function ensure_required_markup( DOMDocument $dom, $script_handles unset( $meta_amp_script_srcs, $first_meta_amp_script_src ); // Insert all the the meta elements next in the head. - $previous_node = $meta_viewport; + // We already sanitized the meta tags to enforce the charset to be index 0 and the viewport to be index 1. + $previous_node = $head->childNodes->item( 1 ); // The viewport node. foreach ( $meta_elements as $meta_element ) { $meta_element->parentNode->removeChild( $meta_element ); $head->insertBefore( $meta_element, $previous_node->nextSibling ); From ab1b29a3a16e961c2cd3fb5a57177cde7b1c1163 Mon Sep 17 00:00:00 2001 From: Alain Schlesser Date: Mon, 18 Nov 2019 11:09:26 +0100 Subject: [PATCH 08/67] Add additional scripts support --- .../sanitizers/class-amp-meta-sanitizer.php | 51 +++++++++++++++++-- 1 file changed, 46 insertions(+), 5 deletions(-) diff --git a/includes/sanitizers/class-amp-meta-sanitizer.php b/includes/sanitizers/class-amp-meta-sanitizer.php index b8d463a9afe..9435225f246 100644 --- a/includes/sanitizers/class-amp-meta-sanitizer.php +++ b/includes/sanitizers/class-amp-meta-sanitizer.php @@ -76,7 +76,9 @@ public function sanitize() { // ... sure? } - $this->ensure_viewport_is_present_and_after_charset( $charset_element ); + $viewport_element = $this->ensure_viewport_is_present_and_after_charset( $charset_element ); + + $this->process_amp_script_meta_tags( $viewport_element ); } /** @@ -139,9 +141,7 @@ protected function ensure_charset_is_present_and_first_in_head() { } // (Re)insert the charset as first element of the head. - $charset_element = $this->head->insertBefore( $charset_element, $this->head->firstChild ); - - return $charset_element; + return $this->head->insertBefore( $charset_element, $this->head->firstChild ); } /** @@ -150,6 +150,8 @@ protected function ensure_charset_is_present_and_first_in_head() { * The viewport defaults to 'width=device-width', which is the bare minimum that AMP requires. * * @param DOMElement $charset_element The charset meta tag element to append the viewport to. + * + * @return DOMElement The viewport element that was detected or added. */ protected function ensure_viewport_is_present_and_after_charset( DOMElement $charset_element ) { // Retrieve the viewport element or create a new one. @@ -161,7 +163,46 @@ protected function ensure_viewport_is_present_and_after_charset( DOMElement $cha } // (Re)insert the viewport as first element of the head. - $this->head->insertBefore( $viewport_element, $charset_element->nextSibling ); + return $this->head->insertBefore( $viewport_element, $charset_element->nextSibling ); + } + + protected function process_amp_script_meta_tags( DOMElement $previous_element ) { + $meta_amp_script_srcs = []; + $meta_elements = []; + foreach ( $this->head->getElementsByTagName( 'meta' ) as $meta ) { + if ( 'amp-script-src' === $meta->getAttribute( 'name' ) ) { + $meta_amp_script_srcs[] = $meta; + } elseif ( ! $meta->hasAttribute( 'charset' ) && 'viewport' !== $meta->getAttribute( 'name' ) ) { + $meta_elements[] = $meta; + } + } + + // Handle meta amp-script-src elements. + $first_meta_amp_script_src = array_shift( $meta_amp_script_srcs ); + if ( $first_meta_amp_script_src ) { + $meta_elements[] = $first_meta_amp_script_src; + + // Merge (and remove) any subsequent meta amp-script-src elements. + if ( ! empty( $meta_amp_script_srcs ) ) { + $content_values = [ $first_meta_amp_script_src->getAttribute( 'content' ) ]; + foreach ( $meta_amp_script_srcs as $meta_amp_script_src ) { + $meta_amp_script_src->parentNode->removeChild( $meta_amp_script_src ); + $content_values[] = $meta_amp_script_src->getAttribute( 'content' ); + } + $first_meta_amp_script_src->setAttribute( 'content', implode( ' ', $content_values ) ); + unset( $meta_amp_script_src, $content_values ); + } + } + unset( $meta_amp_script_srcs, $first_meta_amp_script_src ); + + // Insert all the the meta elements next in the head. + // We already sanitized the meta tags to enforce the charset to be index 0 and the viewport to be index 1. + $previous_node = $this->head->childNodes->item( 1 ); // The viewport node. + foreach ( $meta_elements as $meta_element ) { + $meta_element->parentNode->removeChild( $meta_element ); + $this->head->insertBefore( $meta_element, $previous_node->nextSibling ); + $previous_node = $meta_element; + } } /** From 97d6207fe3ee76686e4785ded127cb8a58977f8c Mon Sep 17 00:00:00 2001 From: Alain Schlesser Date: Mon, 18 Nov 2019 19:19:20 +0100 Subject: [PATCH 09/67] Refactor sanitizer to reduce DOM queries and manipulations --- includes/class-amp-theme-support.php | 2 +- .../sanitizers/class-amp-meta-sanitizer.php | 233 ++++++++++++------ tests/php/test-class-amp-dom-utils.php | 4 +- tests/php/test-class-amp-meta-sanitizer.php | 20 +- tests/php/test-class-amp-theme-support.php | 8 +- 5 files changed, 170 insertions(+), 97 deletions(-) diff --git a/includes/class-amp-theme-support.php b/includes/class-amp-theme-support.php index 1adec99899f..3501762c587 100644 --- a/includes/class-amp-theme-support.php +++ b/includes/class-amp-theme-support.php @@ -1679,7 +1679,7 @@ public static function ensure_required_markup( DOMDocument $dom, $script_handles foreach ( $head->getElementsByTagName( 'meta' ) as $meta ) { if ( 'amp-script-src' === $meta->getAttribute( 'name' ) ) { $meta_amp_script_srcs[] = $meta; - } elseif ( ! $meta->hasAttribute( 'charset' ) && 'viewport' !== $meta->getAttribute( 'name' ) ) { + } elseif ( ! $meta->hasAttribute( 'charset' ) && 'viewport' !== $meta->getAttribute( 'name' ) ) { $meta_elements[] = $meta; } } diff --git a/includes/sanitizers/class-amp-meta-sanitizer.php b/includes/sanitizers/class-amp-meta-sanitizer.php index 9435225f246..1b4342f3f1c 100644 --- a/includes/sanitizers/class-amp-meta-sanitizer.php +++ b/includes/sanitizers/class-amp-meta-sanitizer.php @@ -44,6 +44,35 @@ class AMP_Meta_Sanitizer extends AMP_Base_Sanitizer { */ protected $head; + /* + * Tags array keys. + */ + const TAG_CHARSET = 'charset'; + const TAG_VIEWPORT = 'viewport'; + const TAG_AMP_SCRIPT_SRC = 'amp_script_src'; + const TAG_OTHER = 'other'; + + /** + * Associative array of DOMElement arrays. + * + * Each key in the root level defines one group of meta tags to process. + * + * @var array $tags { + * An array of meta tag groupings. + * + * @type DOMElement[] $charset Charset meta tag(s). + * @type DOMElement[] $viewport Viewport meta tag(s). + * @type DOMElement[] $amp_script_src source meta tags. + * @type DOMElement[] $other Remaining meta tags. + * } + */ + protected $meta_tags = [ + self::TAG_CHARSET => [], + self::TAG_VIEWPORT => [], + self::TAG_AMP_SCRIPT_SRC => [], + self::TAG_OTHER => [], + ]; + /** * Charset to use for AMP markup. * @@ -65,20 +94,38 @@ public function sanitize() { $this->xpath = new DOMXPath( $this->dom ); $this->head = $this->ensure_head_is_present(); + $charset = $this->detect_charset(); + foreach ( $this->dom->getElementsByTagName( static::$tag ) as $element ) { - $this->sanitize_element( $element ); + /** + * Meta tag to process. + * + * @var DOMElement $element + */ + $element = $element->parentNode->removeChild( $element ); + if ( $element->hasAttribute( 'charset' ) ) { + $this->meta_tags[ self::TAG_CHARSET ][] = $element; + } elseif ( 'viewport' === $element->getAttribute( 'name' ) ) { + $this->meta_tags[ self::TAG_VIEWPORT ][] = $element; + } elseif ( 'amp-script-src' === $element->getAttribute( 'name' ) ) { + $this->meta_tags[ self::TAG_AMP_SCRIPT_SRC ][] = $element; + } else { + $this->meta_tags[ self::TAG_OTHER ][] = $element; + } } - $charset_element = $this->ensure_charset_is_present_and_first_in_head(); + $this->ensure_charset_is_present( $charset ); - if ( ! $this->is_correct_charset( $charset_element ) ) { // phpcs:ignore Generic.CodeAnalysis.EmptyStatement + if ( ! $this->is_correct_charset() ) { // phpcs:ignore Generic.CodeAnalysis.EmptyStatement // @TODO Re-encode the content into UTF-8. // ... sure? } - $viewport_element = $this->ensure_viewport_is_present_and_after_charset( $charset_element ); + $this->ensure_viewport_is_present(); - $this->process_amp_script_meta_tags( $viewport_element ); + $this->process_amp_script_meta_tags(); + + $this->re_add_meta_tags_in_optimized_order(); } /** @@ -98,111 +145,109 @@ protected function ensure_head_is_present() { } /** - * Sanitize an individual meta tag. + * Detect the charset of the document. * - * @param DOMElement $element Meta tag to sanitize. + * @return string|false Detected charset of the document, or false if none. */ - protected function sanitize_element( DOMElement $element ) { - // Handle HTML 4 http-equiv meta tags. - if ( 'content-type' === strtolower( $element->getAttribute( 'http-equiv' ) ) ) { - $charset = $element->getAttribute( 'charset' ); - if ( $charset ) { - // If we have a charset attribute included, use that as a separate tag. - $element->parentNode->appendChild( $this->create_charset_element( $charset ) ); - } else { + protected function detect_charset() { + $charset = false; + + // Check for HTML 4 http-equiv meta tags. + $http_equiv_tag = $this->xpath->query( '//meta[ @http-equiv and @content ]' )->item( 0 ); + if ( $http_equiv_tag ) { + $http_equiv_tag->parentNode->removeChild( $http_equiv_tag ); + + // Check for the existence of a proper charset attribute first. + $charset = $http_equiv_tag->getAttribute( 'charset' ); + if ( ! $charset ) { // If not, check whether the charset is included with the content type, and use that. - $content = $element->getAttribute( 'content' ); + $content = $http_equiv_tag->getAttribute( 'content' ); + $matches = []; if ( preg_match( '/;\s*charset=(?[^;]+)/', $content, $matches ) && ! empty( $matches['charset'] ) ) { - $element->parentNode->appendChild( $this->create_charset_element( $matches['charset'] ) ); + $charset = $matches['charset']; } } - // In case we haven't found a charset by now, a default utf-8 one will be added in a later step. + } - // Always remove the HTML 4 http-equiv tag. - $element->parentNode->removeChild( $element ); + // Check for HTML 5 charset meta tag. This overrides the HTML 4 charset. + $charset_tag = $this->xpath->query( '//meta[ @charset ]' )->item( 0 ); + if ( $charset_tag ) { + $charset_tag = $charset_tag->parentNode->removeChild( $charset_tag ); + $charset = $charset_tag->getAttribute( 'charset' ); } + + return $charset; } /** - * Always ensure that we have an HTML 5 charset meta tag, and force it to be the first in . + * Always ensure that we have an HTML 5 charset meta tag. * * The charset defaults to utf-8, which is also what AMP requires. * - * @return DOMElement The charset element that was detected or added. + * @param string|false $charset Optional. Charset that was already detected. False if none. Defaults to false. */ - protected function ensure_charset_is_present_and_first_in_head() { - // Retrieve the charset element or create a new one. - $charset_element = $this->xpath->query( '//meta[ @charset ]' )->item( 0 ); - if ( $charset_element ) { - $charset_element->parentNode->removeChild( $charset_element ); // So that we can move it. - } else { - $charset_element = $this->create_charset_element( static::AMP_CHARSET ); + protected function ensure_charset_is_present( $charset = false ) { + if ( ! empty( $this->meta_tags[ self::TAG_CHARSET ] ) ) { + return; } - // (Re)insert the charset as first element of the head. - return $this->head->insertBefore( $charset_element, $this->head->firstChild ); + $this->meta_tags[ self::TAG_CHARSET ][] = $this->create_charset_element( $charset ?: static::AMP_CHARSET ); } /** - * Always ensure we have a viewport tag and force it to be the second in (after charset). + * Always ensure we have a viewport tag. * * The viewport defaults to 'width=device-width', which is the bare minimum that AMP requires. - * - * @param DOMElement $charset_element The charset meta tag element to append the viewport to. - * - * @return DOMElement The viewport element that was detected or added. */ - protected function ensure_viewport_is_present_and_after_charset( DOMElement $charset_element ) { - // Retrieve the viewport element or create a new one. - $viewport_element = $this->xpath->query( '//meta[ @name = "viewport" ]' )->item( 0 ); - if ( $viewport_element ) { - $viewport_element->parentNode->removeChild( $viewport_element ); // So that we can move it. - } else { - $viewport_element = $this->create_viewport_element( static::AMP_VIEWPORT ); + protected function ensure_viewport_is_present() { + if ( empty( $this->meta_tags[ self::TAG_VIEWPORT ] ) ) { + $this->meta_tags[ self::TAG_VIEWPORT ][] = $this->create_viewport_element( static::AMP_VIEWPORT ); + return; } - // (Re)insert the viewport as first element of the head. - return $this->head->insertBefore( $viewport_element, $charset_element->nextSibling ); - } + // Ensure we have the 'width=device-width' setting included. + $viewport_tag = $this->meta_tags[ self::TAG_VIEWPORT ][0]; + $viewport_content = $viewport_tag->getAttribute( 'content' ); + $viewport_settings = array_map( 'trim', explode( ',', $viewport_content ) ); + $width_found = false; - protected function process_amp_script_meta_tags( DOMElement $previous_element ) { - $meta_amp_script_srcs = []; - $meta_elements = []; - foreach ( $this->head->getElementsByTagName( 'meta' ) as $meta ) { - if ( 'amp-script-src' === $meta->getAttribute( 'name' ) ) { - $meta_amp_script_srcs[] = $meta; - } elseif ( ! $meta->hasAttribute( 'charset' ) && 'viewport' !== $meta->getAttribute( 'name' ) ) { - $meta_elements[] = $meta; + foreach ( $viewport_settings as $index => $viewport_setting ) { + list( $property, $value ) = array_map( 'trim', explode( '=', $viewport_setting ) ); + if ( 'width' === $property ) { + if ( 'device-width' !== $value ) { + $viewport_settings[ $index ] = 'width=device-width'; + } + $width_found = true; + break; } } - // Handle meta amp-script-src elements. - $first_meta_amp_script_src = array_shift( $meta_amp_script_srcs ); - if ( $first_meta_amp_script_src ) { - $meta_elements[] = $first_meta_amp_script_src; - - // Merge (and remove) any subsequent meta amp-script-src elements. - if ( ! empty( $meta_amp_script_srcs ) ) { - $content_values = [ $first_meta_amp_script_src->getAttribute( 'content' ) ]; - foreach ( $meta_amp_script_srcs as $meta_amp_script_src ) { - $meta_amp_script_src->parentNode->removeChild( $meta_amp_script_src ); - $content_values[] = $meta_amp_script_src->getAttribute( 'content' ); - } - $first_meta_amp_script_src->setAttribute( 'content', implode( ' ', $content_values ) ); - unset( $meta_amp_script_src, $content_values ); - } + if ( ! $width_found ) { + array_unshift( $viewport_settings, 'width=device-width' ); } - unset( $meta_amp_script_srcs, $first_meta_amp_script_src ); - - // Insert all the the meta elements next in the head. - // We already sanitized the meta tags to enforce the charset to be index 0 and the viewport to be index 1. - $previous_node = $this->head->childNodes->item( 1 ); // The viewport node. - foreach ( $meta_elements as $meta_element ) { - $meta_element->parentNode->removeChild( $meta_element ); - $this->head->insertBefore( $meta_element, $previous_node->nextSibling ); - $previous_node = $meta_element; + + $viewport_tag->setAttribute( 'content', implode( ',', $viewport_settings ) ); + } + + /** + * Parse and concatenate source meta tags. + */ + protected function process_amp_script_meta_tags() { + if ( empty( $this->meta_tags[ self::TAG_AMP_SCRIPT_SRC ] ) ) { + return; } + + $first_meta_amp_script_src = array_shift( $this->meta_tags[ self::TAG_AMP_SCRIPT_SRC ] ); + $content_values = [ $first_meta_amp_script_src->getAttribute( 'content' ) ]; + + // Merge (and remove) any subsequent meta amp-script-src elements. + while ( ! empty ( $this->meta_tags[ self::TAG_AMP_SCRIPT_SRC ] ) ) { + $meta_amp_script_src = array_shift( $this->meta_tags[ self::TAG_AMP_SCRIPT_SRC ] ); + $content_values[] = $meta_amp_script_src->getAttribute( 'content' ); + } + + $first_meta_amp_script_src->setAttribute( 'content', implode( ' ', $content_values ) ); } /** @@ -241,10 +286,38 @@ protected function create_viewport_element( $viewport ) { /** * Check whether the charset is the correct one according to AMP requirements. * - * @param DOMElement $charset_element Charset meta tag element. * @return bool Whether the charset is the correct one. */ - protected function is_correct_charset( DOMElement $charset_element ) { + protected function is_correct_charset() { + if ( empty( $this->meta_tags[ self::TAG_CHARSET ] ) ) { + throw new LogicException( 'Failed to ensure a charset meta tag is present' ); + } + + $charset_element = $this->meta_tags[ self::TAG_CHARSET ][0]; + return static::AMP_CHARSET === strtolower( $charset_element->getAttribute( 'charset' ) ); } + + /** + * Re-add the meta tags to the node in the optimized order. + * + * The order is defined by the array entries in $this->meta_tags. + */ + protected function re_add_meta_tags_in_optimized_order() { + /** + * Previous meta tag to append to. + * + * @var DOMElement $previous_meta_tag + */ + $previous_meta_tag = null; + foreach ( $this->meta_tags as $meta_tag_group ) { + foreach ( $meta_tag_group as $meta_tag ) { + if ( $previous_meta_tag ) { + $previous_meta_tag = $this->head->insertBefore( $meta_tag, $previous_meta_tag->nextSibling ); + } else { + $previous_meta_tag = $this->head->insertBefore( $meta_tag, $this->head->firstChild ); + } + } + } + } } diff --git a/tests/php/test-class-amp-dom-utils.php b/tests/php/test-class-amp-dom-utils.php index 99999974a90..9170e7ee61d 100644 --- a/tests/php/test-class-amp-dom-utils.php +++ b/tests/php/test-class-amp-dom-utils.php @@ -299,7 +299,7 @@ public function test_mustache_replacements() { * @covers \AMP_DOM_Utils::get_dom() */ public function test_get_dom_encoding() { - $html = 'مرحبا بالعالم! Check out ‘this’ and “that” and—other things.'; + $html = 'مرحبا بالعالم! Check out ‘this’ and “that” and—other things.'; $html .= '

مرحبا بالعالم! Check out ‘this’ and “that” and—other things.

'; $html .= '

مرحبا بالعالم! Check out ‘this’ and “that” and—other things.

'; $html .= '

مرحبا بالعالم! Check out ‘this’ and “that” and—other things.

'; @@ -307,7 +307,7 @@ public function test_get_dom_encoding() { $document = AMP_DOM_Utils::get_dom( $html ); - $this->assertEquals( 'UTF-8', $document->encoding ); + $this->assertEquals( 'utf-8', $document->encoding ); $paragraphs = $document->getElementsByTagName( 'p' ); $this->assertSame( 3, $paragraphs->length ); $this->assertSame( $paragraphs->item( 0 )->textContent, $paragraphs->item( 1 )->textContent ); diff --git a/tests/php/test-class-amp-meta-sanitizer.php b/tests/php/test-class-amp-meta-sanitizer.php index 1466918f095..0dbc3ba9028 100644 --- a/tests/php/test-class-amp-meta-sanitizer.php +++ b/tests/php/test-class-amp-meta-sanitizer.php @@ -17,22 +17,22 @@ class Test_AMP_Meta_Sanitizer extends WP_UnitTestCase { */ public function get_data_for_sanitize() { return [ - // Don't break the correct charset tag (note the caps, which deviates from the default). + // Don't break the correct charset tag. [ - '', - '', + '', + '', ], - // Don't break the correct viewport tag (note the scale, which deviates from the default). + // Don't break the correct viewport tag. [ - '', - '', + '', + '', ], // Move charset and viewport tags from body to head. [ - '', - '', + '', + '', ], // Add default charset tag if none is present. @@ -59,11 +59,11 @@ public function get_data_for_sanitize() { '', ], [ - '', + '', '', ], [ - '', + '', '', ], ]; diff --git a/tests/php/test-class-amp-theme-support.php b/tests/php/test-class-amp-theme-support.php index a073c9dfda8..bcad2ca47e5 100644 --- a/tests/php/test-class-amp-theme-support.php +++ b/tests/php/test-class-amp-theme-support.php @@ -555,7 +555,7 @@ public function test_validate_non_amp_theme() { - + @@ -1935,7 +1935,7 @@ static function ( $url ) { $ordered_contains = [ '', - '', + '', '', '', $output ); + $this->assertContains( '', $output ); // HTML with doctype, comments, and whitespace before head. $input = " \n\n \nHello"; $output = AMP_Theme_Support::prepare_response( $input ); $this->assertContains( 'assertContains( '', $output ); + $this->assertContains( '', $output ); } /** From e68f592370994362c6dfbfb2516af8f71072cfa8 Mon Sep 17 00:00:00 2001 From: Alain Schlesser Date: Mon, 18 Nov 2019 19:27:40 +0100 Subject: [PATCH 10/67] Fix PHPCS issues --- includes/sanitizers/class-amp-meta-sanitizer.php | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/includes/sanitizers/class-amp-meta-sanitizer.php b/includes/sanitizers/class-amp-meta-sanitizer.php index 1b4342f3f1c..5c824c1c3d1 100644 --- a/includes/sanitizers/class-amp-meta-sanitizer.php +++ b/includes/sanitizers/class-amp-meta-sanitizer.php @@ -239,10 +239,10 @@ protected function process_amp_script_meta_tags() { } $first_meta_amp_script_src = array_shift( $this->meta_tags[ self::TAG_AMP_SCRIPT_SRC ] ); - $content_values = [ $first_meta_amp_script_src->getAttribute( 'content' ) ]; + $content_values = [ $first_meta_amp_script_src->getAttribute( 'content' ) ]; // Merge (and remove) any subsequent meta amp-script-src elements. - while ( ! empty ( $this->meta_tags[ self::TAG_AMP_SCRIPT_SRC ] ) ) { + while ( ! empty( $this->meta_tags[ self::TAG_AMP_SCRIPT_SRC ] ) ) { $meta_amp_script_src = array_shift( $this->meta_tags[ self::TAG_AMP_SCRIPT_SRC ] ); $content_values[] = $meta_amp_script_src->getAttribute( 'content' ); } @@ -287,10 +287,11 @@ protected function create_viewport_element( $viewport ) { * Check whether the charset is the correct one according to AMP requirements. * * @return bool Whether the charset is the correct one. + * @throws LengthException If the charset meta tag was not previously retrieved. */ protected function is_correct_charset() { if ( empty( $this->meta_tags[ self::TAG_CHARSET ] ) ) { - throw new LogicException( 'Failed to ensure a charset meta tag is present' ); + throw new LengthException( 'Failed to ensure a charset meta tag is present' ); } $charset_element = $this->meta_tags[ self::TAG_CHARSET ][0]; From d726fee89aac04d53958e34071c56ce4e9ef28ca Mon Sep 17 00:00:00 2001 From: Alain Schlesser Date: Tue, 19 Nov 2019 08:48:11 +0100 Subject: [PATCH 11/67] Remove meta tag handling from AMP_Theme_Support --- includes/class-amp-theme-support.php | 48 ---------------------------- 1 file changed, 48 deletions(-) diff --git a/includes/class-amp-theme-support.php b/includes/class-amp-theme-support.php index 3501762c587..12efb8d5b36 100644 --- a/includes/class-amp-theme-support.php +++ b/includes/class-amp-theme-support.php @@ -1663,54 +1663,6 @@ public static function ensure_required_markup( DOMDocument $dom, $script_handles $head->appendChild( $rel_canonical ); } - /* - * Ensure meta charset and meta viewport are present. - * - * "AMP is already quite restrictive about which markup is allowed in the section. However, - * there are a few basic optimizations that you can apply. The key is to structure the section - * in a way so that all render-blocking scripts and custom fonts load as fast as possible." - * - * "1. The first tag should be the meta charset tag, followed by any remaining meta tags." - * - * {@link https://amp.dev/documentation/guides-and-tutorials/optimize-and-measure/optimize_amp/ Optimize the AMP Runtime loading} - */ - $meta_amp_script_srcs = []; - $meta_elements = []; - foreach ( $head->getElementsByTagName( 'meta' ) as $meta ) { - if ( 'amp-script-src' === $meta->getAttribute( 'name' ) ) { - $meta_amp_script_srcs[] = $meta; - } elseif ( ! $meta->hasAttribute( 'charset' ) && 'viewport' !== $meta->getAttribute( 'name' ) ) { - $meta_elements[] = $meta; - } - } - - // Handle meta amp-script-src elements. - $first_meta_amp_script_src = array_shift( $meta_amp_script_srcs ); - if ( $first_meta_amp_script_src ) { - $meta_elements[] = $first_meta_amp_script_src; - - // Merge (and remove) any subsequent meta amp-script-src elements. - if ( ! empty( $meta_amp_script_srcs ) ) { - $content_values = [ $first_meta_amp_script_src->getAttribute( 'content' ) ]; - foreach ( $meta_amp_script_srcs as $meta_amp_script_src ) { - $meta_amp_script_src->parentNode->removeChild( $meta_amp_script_src ); - $content_values[] = $meta_amp_script_src->getAttribute( 'content' ); - } - $first_meta_amp_script_src->setAttribute( 'content', implode( ' ', $content_values ) ); - unset( $meta_amp_script_src, $content_values ); - } - } - unset( $meta_amp_script_srcs, $first_meta_amp_script_src ); - - // Insert all the the meta elements next in the head. - // We already sanitized the meta tags to enforce the charset to be index 0 and the viewport to be index 1. - $previous_node = $head->childNodes->item( 1 ); // The viewport node. - foreach ( $meta_elements as $meta_element ) { - $meta_element->parentNode->removeChild( $meta_element ); - $head->insertBefore( $meta_element, $previous_node->nextSibling ); - $previous_node = $meta_element; - } - // Handle the title. $title = $head->getElementsByTagName( 'title' )->item( 0 ); if ( $title ) { From e388a223297ed962b243e6428a9de03815df886d Mon Sep 17 00:00:00 2001 From: Alain Schlesser Date: Tue, 19 Nov 2019 10:07:53 +0100 Subject: [PATCH 12/67] Add and test amp-script-src handling --- includes/class-amp-theme-support.php | 4 ++ .../sanitizers/class-amp-meta-sanitizer.php | 12 +++-- tests/php/test-class-amp-meta-sanitizer.php | 16 ++++++ tests/php/test-class-amp-theme-support.php | 50 ------------------- 4 files changed, 29 insertions(+), 53 deletions(-) diff --git a/includes/class-amp-theme-support.php b/includes/class-amp-theme-support.php index 12efb8d5b36..95aca3438b0 100644 --- a/includes/class-amp-theme-support.php +++ b/includes/class-amp-theme-support.php @@ -1663,6 +1663,10 @@ public static function ensure_required_markup( DOMDocument $dom, $script_handles $head->appendChild( $rel_canonical ); } + // Store the last meta tag as the previous node to append to. + $meta_tags = $head->getElementsByTagName( 'meta' ); + $previous_node = $meta_tags->length > 0 ? $meta_tags->item( $meta_tags->length - 1 ) : $head->firstChild; + // Handle the title. $title = $head->getElementsByTagName( 'title' )->item( 0 ); if ( $title ) { diff --git a/includes/sanitizers/class-amp-meta-sanitizer.php b/includes/sanitizers/class-amp-meta-sanitizer.php index 5c824c1c3d1..9de20dd00f3 100644 --- a/includes/sanitizers/class-amp-meta-sanitizer.php +++ b/includes/sanitizers/class-amp-meta-sanitizer.php @@ -93,16 +93,20 @@ class AMP_Meta_Sanitizer extends AMP_Base_Sanitizer { public function sanitize() { $this->xpath = new DOMXPath( $this->dom ); $this->head = $this->ensure_head_is_present(); + $charset = $this->detect_charset(); + $elements = $this->dom->getElementsByTagName( static::$tag ); - $charset = $this->detect_charset(); + // Remove all nodes for easy reordering later on. + $elements = array_map( static function ( $element ) { + return $element->parentNode->removeChild( $element ); + }, iterator_to_array( $elements, false ) ); - foreach ( $this->dom->getElementsByTagName( static::$tag ) as $element ) { + foreach ( $elements as $element ) { /** * Meta tag to process. * * @var DOMElement $element */ - $element = $element->parentNode->removeChild( $element ); if ( $element->hasAttribute( 'charset' ) ) { $this->meta_tags[ self::TAG_CHARSET ][] = $element; } elseif ( 'viewport' === $element->getAttribute( 'name' ) ) { @@ -248,6 +252,8 @@ protected function process_amp_script_meta_tags() { } $first_meta_amp_script_src->setAttribute( 'content', implode( ' ', $content_values ) ); + + $this->meta_tags[ self::TAG_AMP_SCRIPT_SRC ][] = $first_meta_amp_script_src; } /** diff --git a/tests/php/test-class-amp-meta-sanitizer.php b/tests/php/test-class-amp-meta-sanitizer.php index 0dbc3ba9028..d35498fd406 100644 --- a/tests/php/test-class-amp-meta-sanitizer.php +++ b/tests/php/test-class-amp-meta-sanitizer.php @@ -16,6 +16,16 @@ class Test_AMP_Meta_Sanitizer extends WP_UnitTestCase { * @return array[] Array of arrays with test data. */ public function get_data_for_sanitize() { + $script1 = 'document.body.textContent += "First!";'; + $script2 = 'document.body.textContent += "Second!";'; + $script3 = 'document.body.textContent += "Third!";'; + $script4 = 'document.body.textContent += "Fourth! (And forbidden because no amp-script-src meta in head.)";'; + + $script1_hash = amp_generate_script_hash( $script1 ); + $script2_hash = amp_generate_script_hash( $script2 ); + $script3_hash = amp_generate_script_hash( $script3 ); + $script4_hash = amp_generate_script_hash( $script4 ); + return [ // Don't break the correct charset tag. [ @@ -66,6 +76,12 @@ public function get_data_for_sanitize() { '', '', ], + + // Concatenate and reposition script hashes. + [ + '', + '', + ], ]; } diff --git a/tests/php/test-class-amp-theme-support.php b/tests/php/test-class-amp-theme-support.php index bcad2ca47e5..3e1d0aa4c40 100644 --- a/tests/php/test-class-amp-theme-support.php +++ b/tests/php/test-class-amp-theme-support.php @@ -2117,56 +2117,6 @@ public function test_prepare_response_for_submitted_form() { unset( $_SERVER['REQUEST_METHOD'] ); } - /** - * Test combining amp-script-src meta tags. - * - * @covers \amp_generate_script_hash() - */ - public function test_prepare_response_for_amp_script() { - $script1 = 'document.body.textContent += "First!";'; - $script2 = 'document.body.textContent += "Second!";'; - $script3 = 'document.body.textContent += "Third!";'; - $script4 = 'document.body.textContent += "Fourth! (And forbidden because no amp-script-src meta in head.)";'; - - $script1_hash = amp_generate_script_hash( $script1 ); - $script2_hash = amp_generate_script_hash( $script2 ); - $script3_hash = amp_generate_script_hash( $script3 ); - $script4_hash = amp_generate_script_hash( $script4 ); - - ob_start(); - ?> - - - - - - - - - - - - - - - - - query( '/html/head/meta[ @name = "amp-script-src" ]' ); - $this->assertSame( 1, $meta_elements->length ); - - $meta = $meta_elements->item( 0 ); - $this->assertSame( "$script1_hash $script2_hash $script3_hash", $meta->getAttribute( 'content' ) ); - } - /** * Test post-processor cache effectiveness in AMP_Theme_Support::prepare_response(). */ From 71780f03d954eb2acf73bcc32edb5e3457b79339 Mon Sep 17 00:00:00 2001 From: Alain Schlesser Date: Tue, 19 Nov 2019 10:18:23 +0100 Subject: [PATCH 13/67] Fix PHPCS issues --- includes/sanitizers/class-amp-meta-sanitizer.php | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/includes/sanitizers/class-amp-meta-sanitizer.php b/includes/sanitizers/class-amp-meta-sanitizer.php index 9de20dd00f3..a025caa59f4 100644 --- a/includes/sanitizers/class-amp-meta-sanitizer.php +++ b/includes/sanitizers/class-amp-meta-sanitizer.php @@ -97,9 +97,12 @@ public function sanitize() { $elements = $this->dom->getElementsByTagName( static::$tag ); // Remove all nodes for easy reordering later on. - $elements = array_map( static function ( $element ) { - return $element->parentNode->removeChild( $element ); - }, iterator_to_array( $elements, false ) ); + $elements = array_map( + static function ( $element ) { + return $element->parentNode->removeChild( $element ); + }, + iterator_to_array( $elements, false ) + ); foreach ( $elements as $element ) { /** From 7192d679a3ca68b88f93bd298bf1134ff008a0a8 Mon Sep 17 00:00:00 2001 From: Alain Schlesser Date: Tue, 19 Nov 2019 18:13:56 +0100 Subject: [PATCH 14/67] Modify broken test in AMP_Theme_Support --- tests/php/test-class-amp-theme-support.php | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/php/test-class-amp-theme-support.php b/tests/php/test-class-amp-theme-support.php index 3e1d0aa4c40..95ff4e31c0f 100644 --- a/tests/php/test-class-amp-theme-support.php +++ b/tests/php/test-class-amp-theme-support.php @@ -568,11 +568,11 @@ public function test_validate_non_amp_theme() { $original_html = trim( ob_get_clean() ); $sanitized_html = AMP_Theme_Support::prepare_response( $original_html ); - // Invalid viewport meta tag is not present. + // Insufficient viewport tag was not left in. $this->assertNotContains( '', $sanitized_html ); - // Correct viewport meta tag was added. - $this->assertContains( '', $sanitized_html ); + // Viewport tag was modified to include all requirements. + $this->assertContains( '', $sanitized_html ); // MathML script was added. $this->assertContains( '', $sanitized_html ); // phpcs:ignore WordPress.WP.EnqueuedResources.NonEnqueuedScript From 7b57648cbfd815ad70466ed04bef5afed426b529 Mon Sep 17 00:00:00 2001 From: Alain Schlesser Date: Wed, 20 Nov 2019 10:04:44 +0100 Subject: [PATCH 15/67] Apply suggestions from code review - allow for non-lower-case "http-equiv" - trim charset Co-Authored-By: Weston Ruter --- includes/sanitizers/class-amp-meta-sanitizer.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/includes/sanitizers/class-amp-meta-sanitizer.php b/includes/sanitizers/class-amp-meta-sanitizer.php index a025caa59f4..9274c9ab9b2 100644 --- a/includes/sanitizers/class-amp-meta-sanitizer.php +++ b/includes/sanitizers/class-amp-meta-sanitizer.php @@ -160,7 +160,7 @@ protected function detect_charset() { $charset = false; // Check for HTML 4 http-equiv meta tags. - $http_equiv_tag = $this->xpath->query( '//meta[ @http-equiv and @content ]' )->item( 0 ); + $http_equiv_tag = $this->xpath->query( '//meta[ translate(@http-equiv, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz') = 'content-type' and @content ]' )->item( 0 ); if ( $http_equiv_tag ) { $http_equiv_tag->parentNode->removeChild( $http_equiv_tag ); @@ -172,7 +172,7 @@ protected function detect_charset() { $matches = []; if ( preg_match( '/;\s*charset=(?[^;]+)/', $content, $matches ) && ! empty( $matches['charset'] ) ) { - $charset = $matches['charset']; + $charset = trim( $matches['charset'] ); } } } From c36eff4732eee42b00299172a03fe0a8b9834b0c Mon Sep 17 00:00:00 2001 From: Alain Schlesser Date: Wed, 20 Nov 2019 10:07:24 +0100 Subject: [PATCH 16/67] Add explanatory comment to reordering --- includes/sanitizers/class-amp-meta-sanitizer.php | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/includes/sanitizers/class-amp-meta-sanitizer.php b/includes/sanitizers/class-amp-meta-sanitizer.php index 9274c9ab9b2..78b1cfdc222 100644 --- a/includes/sanitizers/class-amp-meta-sanitizer.php +++ b/includes/sanitizers/class-amp-meta-sanitizer.php @@ -312,6 +312,11 @@ protected function is_correct_charset() { * Re-add the meta tags to the node in the optimized order. * * The order is defined by the array entries in $this->meta_tags. + * + * The optimal loading order for AMP pages is documented at: + * https://amp.dev/documentation/guides-and-tutorials/optimize-and-measure/optimize_amp/#optimize-the-amp-runtime-loading + * + * "1. The first tag should be the meta charset tag, followed by any remaining meta tags." */ protected function re_add_meta_tags_in_optimized_order() { /** From 8efc5648ff387b6c397a26281eac792edb9571f0 Mon Sep 17 00:00:00 2001 From: Alain Schlesser Date: Wed, 20 Nov 2019 10:11:41 +0100 Subject: [PATCH 17/67] Remove pass-by-reference on $dom Objects are always passed by reference. --- includes/templates/class-amp-content-sanitizer.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/includes/templates/class-amp-content-sanitizer.php b/includes/templates/class-amp-content-sanitizer.php index 6fca3c5e5a2..2ba12d2106d 100644 --- a/includes/templates/class-amp-content-sanitizer.php +++ b/includes/templates/class-amp-content-sanitizer.php @@ -56,7 +56,7 @@ public static function sanitize( $content, array $sanitizer_classes, $global_arg * @type array $styles Styles. If $args['return_styles'] is not empty. For legacy purposes. * } */ - public static function sanitize_document( &$dom, $sanitizer_classes, $args ) { + public static function sanitize_document( $dom, $sanitizer_classes, $args ) { $scripts = []; $stylesheets = []; $styles = []; From 4e952d9581fbb4202cd0b87dc1e15d65e112c953 Mon Sep 17 00:00:00 2001 From: Alain Schlesser Date: Fri, 22 Nov 2019 18:30:36 +0100 Subject: [PATCH 18/67] Add AMP_DOM_Document abstraction --- includes/class-amp-autoloader.php | 1 + .../sanitizers/class-amp-meta-sanitizer.php | 2 +- includes/utils/class-amp-dom-document.php | 304 ++++++++++++++++++ includes/utils/class-amp-dom-utils.php | 92 ++---- tests/php/test-class-amp-dom-utils.php | 4 +- 5 files changed, 326 insertions(+), 77 deletions(-) create mode 100644 includes/utils/class-amp-dom-document.php diff --git a/includes/class-amp-autoloader.php b/includes/class-amp-autoloader.php index c6241e87671..568470228cc 100644 --- a/includes/class-amp-autoloader.php +++ b/includes/class-amp-autoloader.php @@ -97,6 +97,7 @@ class AMP_Autoloader { 'AMP_Content' => 'includes/templates/class-amp-content', 'AMP_Content_Sanitizer' => 'includes/templates/class-amp-content-sanitizer', 'AMP_Post_Template' => 'includes/templates/class-amp-post-template', + 'AMP_DOM_Document' => 'includes/utils/class-amp-dom-document', 'AMP_DOM_Utils' => 'includes/utils/class-amp-dom-utils', 'AMP_HTML_Utils' => 'includes/utils/class-amp-html-utils', 'AMP_Image_Dimension_Extractor' => 'includes/utils/class-amp-image-dimension-extractor', diff --git a/includes/sanitizers/class-amp-meta-sanitizer.php b/includes/sanitizers/class-amp-meta-sanitizer.php index 78b1cfdc222..256273a564e 100644 --- a/includes/sanitizers/class-amp-meta-sanitizer.php +++ b/includes/sanitizers/class-amp-meta-sanitizer.php @@ -160,7 +160,7 @@ protected function detect_charset() { $charset = false; // Check for HTML 4 http-equiv meta tags. - $http_equiv_tag = $this->xpath->query( '//meta[ translate(@http-equiv, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz') = 'content-type' and @content ]' )->item( 0 ); + $http_equiv_tag = $this->xpath->query( '//meta[ translate(@http-equiv, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz") = "content-type" and @content ]' )->item( 0 ); if ( $http_equiv_tag ) { $http_equiv_tag->parentNode->removeChild( $http_equiv_tag ); diff --git a/includes/utils/class-amp-dom-document.php b/includes/utils/class-amp-dom-document.php new file mode 100644 index 00000000000..c11fa8b564a --- /dev/null +++ b/includes/utils/class-amp-dom-document.php @@ -0,0 +1,304 @@ +]*?\s*http-equiv=[^>]*?>[^<]*(?:<\/meta>)?/i'; + + /** + * Regular expression pattern to match the charset meta tag. + */ + const CHARSET_META_TAG_PATTERN = '/]*?\s*charset=[^>]*?>[^<]*(?:<\/meta>)?/i'; + + /** + * ID of the hacky charset we need to add to make loadHTML() behave. + */ + const CHARSET_HACK_ID = '--amp-dom-document-charset--'; + + /** + * The original encoding of how the AMP_DOM_Document was created. + * + * This is stored to do an automatic conversion to UTF-8, which is + * a requirement for AMP. + * + * @var string + */ + private $original_encoding; + + /** + * Associative array of encoding mappings. + * + * Translates HTML charsets into encodings PHP can understand. + * + * @var string[] + */ + private $encoding_map = [ + 'latin-1' => 'iso-8859-1', + ]; + + /** + * Creates a new AMP_DOM_Document object + * + * @link https://php.net/manual/domdocument.construct.php + * + * @param string $version Optional. The version number of the document as part of the XML declaration. + * @param string $encoding Optional. The encoding of the document as part of the XML declaration. + */ + public function __construct( $version = '', $encoding = null ) { + $this->original_encoding = (string) $encoding ?: self::UNKNOWN_ENCODING; + parent::__construct( $version ?: '1.0', self::AMP_ENCODING ); + } + + /** + * Load HTML from a string. + * + * @link https://php.net/manual/domdocument.loadhtml.php + * + * @param string $source The HTML string. + * @param int|string $options Optional. Specify additional Libxml parameters. + * @return bool true on success or false on failure. + */ + public function loadHTML( $source, $options = 0 ) { + $source = $this->maybe_add_head_or_body( $source ); + + $this->original_encoding = $this->detect_and_strip_encoding( $source ); + + if ( self::AMP_ENCODING !== strtolower( $this->original_encoding ) ) { + $source = $this->adapt_encoding( $source ); + } + + // Force-add http-equiv charset to make DOMDocument behave as it should. + // See: http://php.net/manual/en/domdocument.loadhtml.php#78243 + $source = str_replace( + '', + '', + $source + ); + + $success = parent::loadHTML( $source, $options ); + + if ( $success ) { + $this->encoding = self::AMP_ENCODING; + $head = $this->getElementsByTagName( 'head' )->item( 0 ); + $head->removeChild( $head->firstChild ); + } + + return $success; + } + + /** + * Dumps the internal document into a string using HTML formatting. + * + * @link https://php.net/manual/domdocument.savehtml.php + * + * @param DOMNode $node Optional. Parameter to output a subset of the document. + * @return string The HTML, or false if an error occurred. + */ + public function saveHTML( DOMNode $node = null ) { + // Force-add http-equiv charset to make DOMDocument behave as it should. + // See: http://php.net/manual/en/domdocument.loadhtml.php#78243 + $head = $this->getElementsByTagName( 'head' )->item( 0 ); + $charset = AMP_DOM_Utils::create_node( $this, 'meta', [ 'http-equiv' => 'content-type', 'content' => 'text/html; charset=' . self::AMP_ENCODING ] ); + $head->insertBefore( $charset, $head->firstChild ); + + return str_replace( '', '', parent::saveHTML( $node ) ); + } + + /** + * Maybe add the and/or tag(s). + * + * @param string $content Content to add the head or body tags to. + * @return string Adapted content. + */ + private function maybe_add_head_or_body( $content ) { + $substring = substr( $content, 0, 5000 ); + if ( false === strpos( $substring, '' ) ) { + // Create the required HTML structure if none exists yet. + $content = "{$content}"; + } else { + // seems to be present without . + $content = preg_replace( '#(.*)#', '$1', $content ); + } + } elseif ( false === strpos( $substring, '' ) ) { + // Create a element if none exists yet. + $content = str_replace( 'original_encoding ) { + $this->original_encoding = mb_detect_encoding( $source ); + } + + // Guessing the encoding failed, so we assume UTF-8. + if ( empty( $this->original_encoding ) ) { + $this->original_encoding = self::AMP_ENCODING; + } + + $this->original_encoding = $this->sanitize_encoding( $this->original_encoding ); + + $target = $source; //mb_convert_encoding( $source, self::AMP_ENCODING, $this->original_encoding ); + + return is_string( $target ) ? $target : $source; + } + + /** + * Detect the encoding of the document. + * + * @param string $content Content of which to detect the encoding. + * @return string|false Detected encoding of the document, or false if none. + */ + private function detect_and_strip_encoding( &$content ) { + $encoding = self::UNKNOWN_ENCODING; + + // Check for HTML 4 http-equiv meta tags. + if ( $http_equiv_tag = $this->find_tag( $content, 'meta', 'http-equiv' ) ) { + $encoding = $this->extract_value( $http_equiv_tag, 'charset' ); + } + + // Check for HTML 5 charset meta tag. This overrides the HTML 4 charset. + if ( $charset_tag = $this->find_tag( $content, 'meta', 'charset' ) ) { + $encoding = $this->extract_value( $charset_tag, 'charset' ); + } + + // Strip charset tags if they don't fit the AMP UTF-8 requirement. + if ( self::AMP_ENCODING !== strtolower( $encoding ) ) { + $http_equiv_tag && str_replace( $http_equiv_tag, '', $content ); + $charset_tag && str_replace( $charset_tag, '', $content ); + } + + return $encoding; + } + + /** + * Find a given tag with a given attribute. + * + * If multiple tags match, this method will only return the first one. + * + * @param string $content Content in which to find the tag. + * @param string $element Element of the tag. + * @param string $attribute Attribute that the tag contains. + * @return string|false The requested tag, or false if not found. + */ + private function find_tag( $content, $element, $attribute = null ) { + $matches = []; + $pattern = empty( $attribute ) + ? sprintf( + '/<%1$s[^>]*?>[^<]*(?:<\/%1$s>)?/i', + preg_quote( $element, '/' ) + ) + : sprintf( + '/<%1$s [^>]*?\s*%2$s=[^>]*?>[^<]*(?:<\/%1$s>)?/i', + preg_quote( $element, '/' ), + preg_quote( $attribute, '/' ) + ); + + if ( preg_match( $pattern, $content, $matches ) ) { + return $matches[0]; + } + + return false; + } + + /** + * Extract an attribute value from an HTML tag. + * + * @param string $tag Tag from which to extract the attribute. + * @param string $attribute Attribute of which to extract the value. + * @return string|false Extracted attribute value, false if not found. + */ + private function extract_value( $tag, $attribute ) { + $matches = []; + $pattern = sprintf( + '/%s=(?:([\'"])(?.*)?\1|(?[^ \'";]+))/', + preg_quote( $attribute, '/' ) + ); + + if ( preg_match( $pattern, $tag, $matches ) ) { + return empty( $matches['full'] ) ? $matches['partial'] : $matches['full']; + } + + return false; + } + + /** + * Sanitize the encoding that was detected. + * + * If sanitization fails, it will return 'auto', letting the conversion + * logic try to figure it out itself. + * + * @param string $encoding Encoding to sanitize. + * @return string Sanitized encoding. Falls back to 'auto' on failure. + */ + private function sanitize_encoding( $encoding ) { + static $known_encodings = null; + + if ( null === $known_encodings ) { + $known_encodings = array_map( 'strtolower', mb_list_encodings() ); + } + + if ( array_key_exists( $encoding, $this->encoding_map ) ) { + $encoding = $this->encoding_map[ $encoding ]; + } + + if ( ! in_array( $encoding, $known_encodings, true ) ) { + return 'auto'; + } + + return $encoding; + } + + /** + * Magic getter to implement lazily-created, cached properties for the document. + * + * @param string $name Name of the property to get. + * @return mixed Value of the property, or null if unknown property was requested. + */ + public function __get( $name ) { + switch ( $name ) { + case 'xpath': + $this->xpath = new DOMXPath( $this ); + return $this->xpath; + } + + // Mimic regular PHP behavior for missing notices. + trigger_error( "Undefined property: AMP_DOM_Document::${$name}", E_NOTICE ); + return null; + } +} diff --git a/includes/utils/class-amp-dom-utils.php b/includes/utils/class-amp-dom-utils.php index 68e40fdf73e..cc96d1f7234 100644 --- a/includes/utils/class-amp-dom-utils.php +++ b/includes/utils/class-amp-dom-utils.php @@ -98,12 +98,13 @@ class AMP_DOM_Utils { * @see AMP_DOM_Utils::get_content_from_dom_node() * * @param string $document Valid HTML document to be represented by a DOMDocument. + * @param string $encoding Optional. Encoding to use for the content. Defaults to `get_bloginfo( 'charset' )`. * @return DOMDocument|false Returns DOMDocument, or false if conversion failed. */ - public static function get_dom( $document ) { + public static function get_dom( $document, $encoding = null ) { $libxml_previous_state = libxml_use_internal_errors( true ); - $dom = new DOMDocument(); + $dom = new AMP_DOM_Document( '', $encoding ); // @todo In the future consider an AMP_DOMDocument subclass that does this automatically. See . $document = self::convert_amp_bind_attributes( $document ); @@ -116,7 +117,6 @@ public static function get_dom( $document ) { ); // Deal with bugs in older versions of libxml. - $added_back_compat_meta_content_type = false; if ( version_compare( LIBXML_DOTTED_VERSION, '2.8', '<' ) ) { /* * Replace noscript elements with placeholders since libxml<2.8 can parse them incorrectly. @@ -144,22 +144,6 @@ static function( $noscript_matches ) { ); } - /* - * Add a pre-HTML5-style declaration of the encoding since libxml doesn't always recognize - * HTML5's meta charset. In libxml<2.8 it never does, see . - * In libxml>=2.8, if the meta charset does not appear at the beginning of the head then it fails to be understood. - */ - $document = preg_replace( - '#(?=', - $document, - 1, - $count - ); - if ( 1 === $count ) { - $added_back_compat_meta_content_type = true; - } - /* * Wrap in dummy tags, since XML needs one parent node. * It also makes it easier to loop through nodes. @@ -175,26 +159,7 @@ static function( $noscript_matches ) { return false; } - // Remove pre-HTML5-style encoding declaration if added above. - if ( $added_back_compat_meta_content_type ) { - $meta_http_equiv_element = $dom->getElementById( 'meta-http-equiv-content-type' ); - if ( $meta_http_equiv_element ) { - $meta_http_equiv_element->parentNode->removeChild( $meta_http_equiv_element ); - } - } - - // Make sure there is a head and a body. - $head = $dom->getElementsByTagName( 'head' )->item( 0 ); - if ( ! $head ) { - $head = $dom->createElement( 'head' ); - $dom->documentElement->insertBefore( $head, $dom->documentElement->firstChild ); - } - $body = $dom->getElementsByTagName( 'body' )->item( 0 ); - if ( ! $body ) { - $body = $dom->createElement( 'body' ); - $dom->documentElement->appendChild( $body ); - } - self::move_invalid_head_nodes_to_body( $head, $body ); + self::move_invalid_head_nodes_to_body( $dom ); return $dom; } @@ -205,10 +170,12 @@ static function( $noscript_matches ) { * Apparently PHP's DOM is more lenient when parsing HTML to allow nodes in the HEAD which do not belong. A proper * HTML5 parser should rather prematurely short-circuit the HEAD when it finds an illegal element. * - * @param DOMElement $head HEAD element. - * @param DOMElement $body BODY element. + * @param DOMDocument DOM Document to manipulate. */ - private static function move_invalid_head_nodes_to_body( $head, $body ) { + private static function move_invalid_head_nodes_to_body( DOMDocument $dom ) { + $head = $dom->getElementsByTagName( 'head' )->item( 0 ); + $body = $dom->getElementsByTagName( 'body' )->item( 0 ); + // Walking backwards makes it easier to move elements in the expected order. $node = $head->lastChild; while ( $node ) { @@ -407,26 +374,25 @@ public static function restore_amp_bind_attributes( $html ) { * * @since 0.2 * - * @param string $content Valid HTML content to be represented by a DOMDocument. + * @param string $content Valid HTML content to be represented by a DOMDocument. + * @param string $encoding Optional. Encoding to use for the content. Defaults to `get_bloginfo( 'charset' )`. * * @return DOMDocument|false Returns DOMDocument, or false if conversion failed. */ - public static function get_dom_from_content( $content ) { + public static function get_dom_from_content( $content, $encoding = null ) { + // Detect encoding from the current WordPress installation. + if ( null === $encoding ) { + $encoding = get_bloginfo( 'charset' ); + } + /* * Wrap in dummy tags, since XML needs one parent node. * It also makes it easier to loop through nodes. * We can later use this to extract our nodes. - * Add utf-8 charset so loadHTML does not have problems parsing it. - * See: http://php.net/manual/en/domdocument.loadhtml.php#78243 */ - $document = sprintf( - '%s', - get_bloginfo( 'charset' ), - $content - ); - - return self::get_dom( $document ); + $document = "{$content}"; + return self::get_dom( $document, $encoding ); } /** @@ -523,16 +489,6 @@ public static function get_content_from_dom_node( $dom, $node ) { * be fixed in PHP 7.3, but for older versions of PHP the following workaround is needed. */ - /* - * First make sure meta[charset] gets http-equiv and content attributes to work around issue - * with $dom->saveHTML() erroneously encoding UTF-8 as HTML entities. - */ - $meta_charset = $xpath->query( '/html/head/meta[ @charset ]' )->item( 0 ); - if ( $meta_charset ) { - $meta_charset->setAttribute( 'http-equiv', 'Content-Type' ); - $meta_charset->setAttribute( 'content', sprintf( 'text/html; charset=%s', $meta_charset->getAttribute( 'charset' ) ) ); - } - $boundary = 'fragment_boundary:' . wp_rand(); $start_boundary = $boundary . ':start'; $end_boundary = $boundary . ':end'; @@ -546,16 +502,6 @@ public static function get_content_from_dom_node( $dom, $node ) { $dom->saveHTML() ); - // Remove meta[http-equiv] and meta[content] attributes which were added to meta[charset] for HTML serialization. - if ( $meta_charset ) { - if ( $dom->documentElement === $node ) { - $html = preg_replace( '#(#i', '$1>', $html ); - } - - $meta_charset->removeAttribute( 'http-equiv' ); - $meta_charset->removeAttribute( 'content' ); - } - $node->parentNode->removeChild( $comment_start ); $node->parentNode->removeChild( $comment_end ); } diff --git a/tests/php/test-class-amp-dom-utils.php b/tests/php/test-class-amp-dom-utils.php index 9170e7ee61d..fd9b783d7c9 100644 --- a/tests/php/test-class-amp-dom-utils.php +++ b/tests/php/test-class-amp-dom-utils.php @@ -421,9 +421,7 @@ public function test_ensuring_head_body() { $this->assertEquals( 'head', $dom->documentElement->firstChild->nodeName ); $this->assertEquals( 0, $dom->documentElement->firstChild->childNodes->length ); $this->assertEquals( 'body', $dom->documentElement->lastChild->nodeName ); - $p = $dom->getElementsByTagName( 'p' )->item( 0 ); - $this->assertEquals( $dom->documentElement->lastChild, $p->parentNode ); - $this->assertEquals( 'Hello world', $p->textContent ); + $this->assertEquals( 'Hello world', $dom->documentElement->lastChild->lastChild->textContent ); } /** From 7a5ca46255fcc3f53ee199b589a77c155d906128 Mon Sep 17 00:00:00 2001 From: Alain Schlesser Date: Fri, 22 Nov 2019 18:46:50 +0100 Subject: [PATCH 19/67] Keep meta sanitizer at the end of the sanitisation run --- includes/amp-helper-functions.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/includes/amp-helper-functions.php b/includes/amp-helper-functions.php index 6c42c60f91f..e75beaca5b9 100644 --- a/includes/amp-helper-functions.php +++ b/includes/amp-helper-functions.php @@ -972,7 +972,7 @@ function amp_get_content_sanitizers( $post = null ) { } // Force style sanitizer and whitelist sanitizer to be at end. - foreach ( [ 'AMP_Style_Sanitizer', 'AMP_Tag_And_Attribute_Sanitizer' ] as $class_name ) { + foreach ( [ 'AMP_Style_Sanitizer', 'AMP_Meta_Sanitizer', 'AMP_Tag_And_Attribute_Sanitizer' ] as $class_name ) { if ( isset( $sanitizers[ $class_name ] ) ) { $sanitizer = $sanitizers[ $class_name ]; unset( $sanitizers[ $class_name ] ); From 9267e8e41e37e67f6067ad9af842ff9c357659e3 Mon Sep 17 00:00:00 2001 From: Alain Schlesser Date: Fri, 22 Nov 2019 19:01:34 +0100 Subject: [PATCH 20/67] Fix sanitizer ordering test --- tests/php/test-amp-helper-functions.php | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/php/test-amp-helper-functions.php b/tests/php/test-amp-helper-functions.php index 0935e4d7bf7..edcac5b2e6d 100644 --- a/tests/php/test-amp-helper-functions.php +++ b/tests/php/test-amp-helper-functions.php @@ -739,8 +739,9 @@ static function( $classes ) { } ); $ordered_sanitizers = array_keys( amp_get_content_sanitizers() ); - $this->assertEquals( 'Even_After_Whitelist_Sanitizer', $ordered_sanitizers[ count( $ordered_sanitizers ) - 3 ] ); - $this->assertEquals( 'AMP_Style_Sanitizer', $ordered_sanitizers[ count( $ordered_sanitizers ) - 2 ] ); + $this->assertEquals( 'Even_After_Whitelist_Sanitizer', $ordered_sanitizers[ count( $ordered_sanitizers ) - 4 ] ); + $this->assertEquals( 'AMP_Style_Sanitizer', $ordered_sanitizers[ count( $ordered_sanitizers ) - 3 ] ); + $this->assertEquals( 'AMP_Meta_Sanitizer', $ordered_sanitizers[ count( $ordered_sanitizers ) - 2 ] ); $this->assertEquals( 'AMP_Tag_And_Attribute_Sanitizer', $ordered_sanitizers[ count( $ordered_sanitizers ) - 1 ] ); } From 1e50685b31054a5cec33778ef36d7f0d9518a0bc Mon Sep 17 00:00:00 2001 From: Alain Schlesser Date: Fri, 22 Nov 2019 19:04:02 +0100 Subject: [PATCH 21/67] Satisfy PHPCS --- includes/utils/class-amp-dom-document.php | 27 +++++++++++++++-------- includes/utils/class-amp-dom-utils.php | 2 +- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/includes/utils/class-amp-dom-document.php b/includes/utils/class-amp-dom-document.php index c11fa8b564a..6a8c1e1cf6c 100644 --- a/includes/utils/class-amp-dom-document.php +++ b/includes/utils/class-amp-dom-document.php @@ -94,7 +94,7 @@ public function loadHTML( $source, $options = 0 ) { } // Force-add http-equiv charset to make DOMDocument behave as it should. - // See: http://php.net/manual/en/domdocument.loadhtml.php#78243 + // See: http://php.net/manual/en/domdocument.loadhtml.php#78243. $source = str_replace( '', '', @@ -105,7 +105,7 @@ public function loadHTML( $source, $options = 0 ) { if ( $success ) { $this->encoding = self::AMP_ENCODING; - $head = $this->getElementsByTagName( 'head' )->item( 0 ); + $head = $this->getElementsByTagName( 'head' )->item( 0 ); $head->removeChild( $head->firstChild ); } @@ -122,9 +122,16 @@ public function loadHTML( $source, $options = 0 ) { */ public function saveHTML( DOMNode $node = null ) { // Force-add http-equiv charset to make DOMDocument behave as it should. - // See: http://php.net/manual/en/domdocument.loadhtml.php#78243 + // See: http://php.net/manual/en/domdocument.loadhtml.php#78243. $head = $this->getElementsByTagName( 'head' )->item( 0 ); - $charset = AMP_DOM_Utils::create_node( $this, 'meta', [ 'http-equiv' => 'content-type', 'content' => 'text/html; charset=' . self::AMP_ENCODING ] ); + $charset = AMP_DOM_Utils::create_node( + $this, + 'meta', + [ + 'http-equiv' => 'content-type', + 'content' => 'text/html; charset=' . self::AMP_ENCODING, + ] + ); $head->insertBefore( $charset, $head->firstChild ); return str_replace( '', '', parent::saveHTML( $node ) ); @@ -166,14 +173,14 @@ private function adapt_encoding( $source ) { $this->original_encoding = mb_detect_encoding( $source ); } - // Guessing the encoding failed, so we assume UTF-8. + // Guessing the encoding seems to have failed, so we assume UTF-8 instead. if ( empty( $this->original_encoding ) ) { $this->original_encoding = self::AMP_ENCODING; } $this->original_encoding = $this->sanitize_encoding( $this->original_encoding ); - $target = $source; //mb_convert_encoding( $source, self::AMP_ENCODING, $this->original_encoding ); + $target = mb_convert_encoding( $source, self::AMP_ENCODING, $this->original_encoding ); return is_string( $target ) ? $target : $source; } @@ -188,12 +195,14 @@ private function detect_and_strip_encoding( &$content ) { $encoding = self::UNKNOWN_ENCODING; // Check for HTML 4 http-equiv meta tags. - if ( $http_equiv_tag = $this->find_tag( $content, 'meta', 'http-equiv' ) ) { + $http_equiv_tag = $this->find_tag( $content, 'meta', 'http-equiv' ); + if ( $http_equiv_tag ) { $encoding = $this->extract_value( $http_equiv_tag, 'charset' ); } // Check for HTML 5 charset meta tag. This overrides the HTML 4 charset. - if ( $charset_tag = $this->find_tag( $content, 'meta', 'charset' ) ) { + $charset_tag = $this->find_tag( $content, 'meta', 'charset' ); + if ( $charset_tag ) { $encoding = $this->extract_value( $charset_tag, 'charset' ); } @@ -298,7 +307,7 @@ public function __get( $name ) { } // Mimic regular PHP behavior for missing notices. - trigger_error( "Undefined property: AMP_DOM_Document::${$name}", E_NOTICE ); + trigger_error( "Undefined property: AMP_DOM_Document::${$name}", E_NOTICE ); // phpcs:ignore WordPress.PHP.DevelopmentFunctions,WordPress.Security.EscapeOutput return null; } } diff --git a/includes/utils/class-amp-dom-utils.php b/includes/utils/class-amp-dom-utils.php index cc96d1f7234..b53eae69305 100644 --- a/includes/utils/class-amp-dom-utils.php +++ b/includes/utils/class-amp-dom-utils.php @@ -170,7 +170,7 @@ static function( $noscript_matches ) { * Apparently PHP's DOM is more lenient when parsing HTML to allow nodes in the HEAD which do not belong. A proper * HTML5 parser should rather prematurely short-circuit the HEAD when it finds an illegal element. * - * @param DOMDocument DOM Document to manipulate. + * @param DOMDocument $dom DOM Document to manipulate. */ private static function move_invalid_head_nodes_to_body( DOMDocument $dom ) { $head = $dom->getElementsByTagName( 'head' )->item( 0 ); From ed93ad64d4d2efe7863aef522fe8171ff5f5fcd8 Mon Sep 17 00:00:00 2001 From: Alain Schlesser Date: Mon, 25 Nov 2019 17:03:01 +0100 Subject: [PATCH 22/67] Wrap mb_* in function_exists() --- includes/utils/class-amp-dom-document.php | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/includes/utils/class-amp-dom-document.php b/includes/utils/class-amp-dom-document.php index 6a8c1e1cf6c..fc18f9e2b50 100644 --- a/includes/utils/class-amp-dom-document.php +++ b/includes/utils/class-amp-dom-document.php @@ -169,7 +169,7 @@ private function maybe_add_head_or_body( $content ) { */ private function adapt_encoding( $source ) { // No encoding was provided, so we need to guess. - if ( self::UNKNOWN_ENCODING === $this->original_encoding ) { + if ( self::UNKNOWN_ENCODING === $this->original_encoding && function_exists( 'mb_detect_encoding' ) ) { $this->original_encoding = mb_detect_encoding( $source ); } @@ -180,7 +180,7 @@ private function adapt_encoding( $source ) { $this->original_encoding = $this->sanitize_encoding( $this->original_encoding ); - $target = mb_convert_encoding( $source, self::AMP_ENCODING, $this->original_encoding ); + $target = function_exists( 'mb_convert_encoding' ) ? mb_convert_encoding( $source, self::AMP_ENCODING, $this->original_encoding ) : false; return is_string( $target ) ? $target : $source; } @@ -276,6 +276,10 @@ private function extract_value( $tag, $attribute ) { * @return string Sanitized encoding. Falls back to 'auto' on failure. */ private function sanitize_encoding( $encoding ) { + if ( ! function_exists( 'mb_list_encodings' ) ) { + return $encoding; + } + static $known_encodings = null; if ( null === $known_encodings ) { @@ -287,7 +291,7 @@ private function sanitize_encoding( $encoding ) { } if ( ! in_array( $encoding, $known_encodings, true ) ) { - return 'auto'; + return self::UNKNOWN_ENCODING; } return $encoding; From c98e7e5d32cfeb92d2e6378257c38fe404d2680d Mon Sep 17 00:00:00 2001 From: Alain Schlesser Date: Mon, 25 Nov 2019 17:03:57 +0100 Subject: [PATCH 23/67] Fix DOMXPath type case mismatch --- includes/utils/class-amp-dom-document.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/includes/utils/class-amp-dom-document.php b/includes/utils/class-amp-dom-document.php index fc18f9e2b50..cbf457545d7 100644 --- a/includes/utils/class-amp-dom-document.php +++ b/includes/utils/class-amp-dom-document.php @@ -10,7 +10,7 @@ * * @since 1.5 * - * @property DOMXpath $xpath XPath query object for this document. + * @property DOMXPath $xpath XPath query object for this document. * * Abstract away some of the difficulties of working with PHP's DOMDocument. */ From 51a20020f58c6c176c7b09f727a5b1f9c709a4e6 Mon Sep 17 00:00:00 2001 From: Alain Schlesser Date: Mon, 25 Nov 2019 17:08:13 +0100 Subject: [PATCH 24/67] Remove useless setting of encoding property --- includes/utils/class-amp-dom-document.php | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/includes/utils/class-amp-dom-document.php b/includes/utils/class-amp-dom-document.php index cbf457545d7..5eded642913 100644 --- a/includes/utils/class-amp-dom-document.php +++ b/includes/utils/class-amp-dom-document.php @@ -104,8 +104,7 @@ public function loadHTML( $source, $options = 0 ) { $success = parent::loadHTML( $source, $options ); if ( $success ) { - $this->encoding = self::AMP_ENCODING; - $head = $this->getElementsByTagName( 'head' )->item( 0 ); + $head = $this->getElementsByTagName( 'head' )->item( 0 ); $head->removeChild( $head->firstChild ); } From 55b98776ecc3124ef868a54e5610a17e9bb23420 Mon Sep 17 00:00:00 2001 From: Alain Schlesser Date: Mon, 25 Nov 2019 17:12:09 +0100 Subject: [PATCH 25/67] Add safeguard around removing http-equiv charset --- includes/utils/class-amp-dom-document.php | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/includes/utils/class-amp-dom-document.php b/includes/utils/class-amp-dom-document.php index 5eded642913..b1eff15f8e1 100644 --- a/includes/utils/class-amp-dom-document.php +++ b/includes/utils/class-amp-dom-document.php @@ -104,8 +104,12 @@ public function loadHTML( $source, $options = 0 ) { $success = parent::loadHTML( $source, $options ); if ( $success ) { + // Remove http-equiv charset again. $head = $this->getElementsByTagName( 'head' )->item( 0 ); - $head->removeChild( $head->firstChild ); + $meta = $head->firstChild; + if ( 'meta' === $meta->tagName && 'content-type' === $meta->getAttribute( 'http-equiv' ) && ( 'text/html; charset=' . self::AMP_ENCODING ) === $meta->getAttribute( 'content' ) ) { + $head->removeChild( $meta ); + } } return $success; From 2bdf1b916273fc55167f00f93d20ff8dcdaae499 Mon Sep 17 00:00:00 2001 From: Alain Schlesser Date: Mon, 25 Nov 2019 20:19:47 +0100 Subject: [PATCH 26/67] Adapt menu toggle test to ignore head --- ...st-class-amp-nav-menu-toggle-sanitizer.php | 25 +++++++++---------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/tests/php/test-class-amp-nav-menu-toggle-sanitizer.php b/tests/php/test-class-amp-nav-menu-toggle-sanitizer.php index 5324f57d0dd..e0867ba0a46 100644 --- a/tests/php/test-class-amp-nav-menu-toggle-sanitizer.php +++ b/tests/php/test-class-amp-nav-menu-toggle-sanitizer.php @@ -22,7 +22,6 @@ public function data_converter() { $container_id = 'nav-menu-container'; $toggle_id = 'nav-menu-toggle'; - $head = sprintf( '', get_bloginfo( 'charset' ) ); $container = ''; $toggle = ''; @@ -39,8 +38,8 @@ public function data_converter() { return [ 'container_before_toggle' => [ - '' . $head . '' . $container . $toggle . '', - '' . $head . '' . $amp_state . str_replace( '>', $amp_get_container_attrs( 'nav-menu-wrapper' ) . '>', $container ) . str_replace( '>Toggle', $amp_get_toggle_attrs() . '>Toggle', $toggle ) . '', + '' . $container . $toggle . '', + '' . $amp_state . str_replace( '>', $amp_get_container_attrs( 'nav-menu-wrapper' ) . '>', $container ) . str_replace( '>Toggle', $amp_get_toggle_attrs() . '>Toggle', $toggle ) . '', [ 'nav_container_id' => $container_id, 'menu_button_id' => $toggle_id, @@ -49,8 +48,8 @@ public function data_converter() { ], ], 'toggle_before_container' => [ - '' . $head . '' . $toggle . $container . '', - '' . $head . '' . str_replace( '>Toggle', $amp_get_toggle_attrs() . '>Toggle', $toggle ) . $amp_state . str_replace( '>', $amp_get_container_attrs( 'nav-menu-wrapper' ) . '>', $container ) . '', + '' . $toggle . $container . '', + '' . str_replace( '>Toggle', $amp_get_toggle_attrs() . '>Toggle', $toggle ) . $amp_state . str_replace( '>', $amp_get_container_attrs( 'nav-menu-wrapper' ) . '>', $container ) . '', [ 'nav_container_id' => $container_id, 'menu_button_id' => $toggle_id, @@ -59,8 +58,8 @@ public function data_converter() { ], ], 'container_is_body' => [ - '' . $head . '' . $container . $toggle . '', - '' . $head . '' . $amp_state . $container . str_replace( '>Toggle', $amp_get_toggle_attrs( '', '' ) . '>Toggle', $toggle ) . '', + '' . $container . $toggle . '', + '' . $amp_state . $container . str_replace( '>Toggle', $amp_get_toggle_attrs( '', '' ) . '>Toggle', $toggle ) . '', [ 'nav_container_xpath' => '//body', 'menu_button_id' => $toggle_id, @@ -68,8 +67,8 @@ public function data_converter() { ], ], 'container_is_html' => [ - '' . $head . '' . $container . $toggle . '', - '' . $head . '' . $amp_state . $container . str_replace( '>Toggle', $amp_get_toggle_attrs( '', '' ) . '>Toggle', $toggle ) . '', + '' . $container . $toggle . '', + '' . $amp_state . $container . str_replace( '>Toggle', $amp_get_toggle_attrs( '', '' ) . '>Toggle', $toggle ) . '', [ 'nav_container_xpath' => '//html', 'menu_button_id' => $toggle_id, @@ -77,16 +76,16 @@ public function data_converter() { ], ], 'no_container_provided' => [ - '' . $head . '' . $container . $toggle . '', - '' . $head . '' . $container . '', + '' . $container . $toggle . '', + '' . $container . '', [ 'menu_button_id' => $toggle_id, 'nav_container_toggle_class' => 'toggled-on', ], ], 'no_arguments_provided' => [ - '' . $head . '' . $container . $toggle . '', - '' . $head . '' . $container . $toggle . '', + '' . $container . $toggle . '', + '' . $container . $toggle . '', [], ], ]; From cf3447715b32aafd1ebdbbe30ca3afe55bfe38c4 Mon Sep 17 00:00:00 2001 From: Alain Schlesser Date: Mon, 25 Nov 2019 20:20:01 +0100 Subject: [PATCH 27/67] Harden document structure parsing --- includes/utils/class-amp-dom-document.php | 94 ++++++++++++++++---- tests/php/test-class-amp-dom-document.php | 101 ++++++++++++++++++++++ 2 files changed, 176 insertions(+), 19 deletions(-) create mode 100644 tests/php/test-class-amp-dom-document.php diff --git a/includes/utils/class-amp-dom-document.php b/includes/utils/class-amp-dom-document.php index b1eff15f8e1..be1f3111747 100644 --- a/includes/utils/class-amp-dom-document.php +++ b/includes/utils/class-amp-dom-document.php @@ -36,6 +36,11 @@ final class AMP_DOM_Document extends DOMDocument { */ const CHARSET_META_TAG_PATTERN = '/]*?\s*charset=[^>]*?>[^<]*(?:<\/meta>)?/i'; + /** + * Regular expression pattern to match the main HTML structural tags. + */ + const HTML_STRUCTURE_PATTERN = '/(?:.*?(?]*)?>))?(?:.*?(?]*)?>)(?:.*?(?]*)?>).*?(?<\/head\s*>))?(?:.*?(?]*)?>).*?(?<\/body\s*>))?.*?(?<\/html\s*>))?/is'; + /** * ID of the hacky charset we need to add to make loadHTML() behave. */ @@ -95,9 +100,9 @@ public function loadHTML( $source, $options = 0 ) { // Force-add http-equiv charset to make DOMDocument behave as it should. // See: http://php.net/manual/en/domdocument.loadhtml.php#78243. - $source = str_replace( - '', - '', + $source = preg_replace( + '/]*)?>/i', + '$1', $source ); @@ -137,30 +142,76 @@ public function saveHTML( DOMNode $node = null ) { ); $head->insertBefore( $charset, $head->firstChild ); - return str_replace( '', '', parent::saveHTML( $node ) ); + return preg_replace( + sprintf( + '##i', + preg_quote( self::AMP_ENCODING, '#' ) + ), + '', + parent::saveHTML( $node ) + ); } /** - * Maybe add the and/or tag(s). + * Maybe add the , and/or tag(s). * * @param string $content Content to add the head or body tags to. * @return string Adapted content. */ private function maybe_add_head_or_body( $content ) { - $substring = substr( $content, 0, 5000 ); - if ( false === strpos( $substring, '' ) ) { - // Create the required HTML structure if none exists yet. - $content = "{$content}"; - } else { - // seems to be present without . - $content = preg_replace( '#(.*)#', '$1', $content ); - } - } elseif ( false === strpos( $substring, '' ) ) { - // Create a element if none exists yet. - $content = str_replace( '$1' + . ( empty( $matches['html_end'] ) ? '' : $matches['html_end'] ), + $content + ); + } elseif ( empty( $matches['body_start'] ) && ! empty( $matches['head_start'] ) ) { + // Head without body, so wrap content in body. + $pattern = sprintf( + '/%s(.*)%s/i', + preg_quote( $matches['head_end'], '/' ), + ( empty( $matches['html_end'] ) ? '' : preg_quote( $matches['html_end'], '/' ) ) + ); + $content = preg_replace( + $pattern, + $matches['head_end'] + . '$1' + . ( empty( $matches['html_end'] ) ? '' : $matches['html_end'] ), + $content + ); + } elseif ( empty( $matches['head_start'] ) && ! empty( $matches['body_start'] ) ) { + // Body without head, so add empty head before body. + $content = str_replace( $matches['body_start'], '' . $matches['body_start'], $content ); + } + + if ( empty( $matches['html_start'] ) ) { + // No surround html tag, so wrap the content in html. + $content = "{$content}"; + } + + // Re-add AMP default doctype. + $content = "{$content}"; + return $content; } @@ -211,8 +262,13 @@ private function detect_and_strip_encoding( &$content ) { // Strip charset tags if they don't fit the AMP UTF-8 requirement. if ( self::AMP_ENCODING !== strtolower( $encoding ) ) { - $http_equiv_tag && str_replace( $http_equiv_tag, '', $content ); - $charset_tag && str_replace( $charset_tag, '', $content ); + if ( $http_equiv_tag ) { + $content = str_replace( $http_equiv_tag, '', $content ); + } + + if ( $charset_tag ) { + $content = str_replace( $charset_tag, '', $content ); + } } return $encoding; diff --git a/tests/php/test-class-amp-dom-document.php b/tests/php/test-class-amp-dom-document.php new file mode 100644 index 00000000000..7933ed74f7b --- /dev/null +++ b/tests/php/test-class-amp-dom-document.php @@ -0,0 +1,101 @@ + [ + 'utf-8', + '', + '', + ], + 'valid_document_with_attributes' => [ + 'utf-8', + '

Text

', + '

Text

', + ], + 'missing_head' => [ + 'utf-8', + '

Text

', + '

Text

', + ], + 'missing_body' => [ + 'utf-8', + '

Text

', + '

Text

', + ], + 'missing_head_and_body' => [ + 'utf-8', + '

Text

', + '

Text

', + ], + // @todo This one is still broken. + /*'content_only' => [ + 'utf-8', + '

Text

', + '

Text

', + ],*/ + 'missing_doctype' => [ + 'utf-8', + '

Text

', + '

Text

', + ], + 'html-4_doctype' => [ + 'utf-8', + '

Text

', + '

Text

', + ], + ]; + } + + /** + * Tests loading and saving the content via AMP_DOM_Document. + * + * @param string $charset Charset to use. + * @param string $source Source content. + * @param string $expected Expected target content. + * + * @dataProvider data_dom_document + * @covers AMP_DOM_Document::loadHTML() + * @covers AMP_DOM_Document::saveHTML() + */ + public function test_dom_document( $charset, $source, $expected ) { + $document = new AMP_DOM_Document( '', $charset ); + $document->loadHTML( $source ); + + $this->assertEqualMarkup( $expected, $document->saveHTML() ); + } + + /** + * Assert markup is equal. + * + * @param string $expected Expected markup. + * @param string $actual Actual markup. + */ + public function assertEqualMarkup( $expected, $actual ) { + $actual = preg_replace( '/\s+/', ' ', $actual ); + $expected = preg_replace( '/\s+/', ' ', $expected ); + $actual = preg_replace( '/(?<=>)\s+(?=<)/', '', trim( $actual ) ); + $expected = preg_replace( '/(?<=>)\s+(?=<)/', '', trim( $expected ) ); + + $this->assertEquals( + array_filter( preg_split( '#(<[^>]+>|[^<>]+)#', $expected, -1, PREG_SPLIT_DELIM_CAPTURE ) ), + array_filter( preg_split( '#(<[^>]+>|[^<>]+)#', $actual, -1, PREG_SPLIT_DELIM_CAPTURE ) ) + ); + } +} From a9f1a042092f419cf2df2284465427b6512ac068 Mon Sep 17 00:00:00 2001 From: Alain Schlesser Date: Wed, 27 Nov 2019 17:46:42 +0100 Subject: [PATCH 28/67] Converting automatically to UTF-8 seems to work --- includes/utils/class-amp-dom-document.php | 103 ++++++++++++------ tests/php/test-class-amp-dom-document.php | 85 +++++++++++++-- ...st-class-amp-nav-menu-toggle-sanitizer.php | 24 ++-- 3 files changed, 159 insertions(+), 53 deletions(-) diff --git a/includes/utils/class-amp-dom-document.php b/includes/utils/class-amp-dom-document.php index be1f3111747..3aa03912e02 100644 --- a/includes/utils/class-amp-dom-document.php +++ b/includes/utils/class-amp-dom-document.php @@ -23,9 +23,20 @@ final class AMP_DOM_Document extends DOMDocument { /** * Encoding identifier to use for an unknown encoding. + * + * "auto" is recognized by mb_convert_encoding() as a special value. */ const UNKNOWN_ENCODING = 'auto'; + /** + * Encoding detection order in case we have to guess. + * + * This list of encoding detection order is just a wild guess and might need fine-tuning over time. + * If the charset was not provided explicitly, we can really only guess, as the detection can + * never be 100% accurate and reliable. + */ + const ENCODING_DETECTION_ORDER = 'UTF-8, EUC-JP, eucJP-win, JIS, ISO-2022-JP, ISO-8859-15, ISO-8859-1, ASCII'; + /** * Regular expression pattern to match the http-equiv meta tag. */ @@ -39,7 +50,7 @@ final class AMP_DOM_Document extends DOMDocument { /** * Regular expression pattern to match the main HTML structural tags. */ - const HTML_STRUCTURE_PATTERN = '/(?:.*?(?]*)?>))?(?:.*?(?]*)?>)(?:.*?(?]*)?>).*?(?<\/head\s*>))?(?:.*?(?]*)?>).*?(?<\/body\s*>))?.*?(?<\/html\s*>))?/is'; + const HTML_STRUCTURE_PATTERN = '/(?:.*?(?]*)?>))?(?:(?.*?)(?]*)?>))?(?:.*?(?]*)?>.*?<\/head\s*>))?(?:.*?(?]*)?>.*?<\/body\s*>))?.*?(?:(?:.*(?<\/html\s*>)|.*)(?.*))/is'; /** * ID of the hacky charset we need to add to make loadHTML() behave. @@ -64,7 +75,8 @@ final class AMP_DOM_Document extends DOMDocument { * @var string[] */ private $encoding_map = [ - 'latin-1' => 'iso-8859-1', + // Assume ISO-8859-1 for some charsets. + 'latin-1' => 'ISO-8859-1', ]; /** @@ -90,7 +102,7 @@ public function __construct( $version = '', $encoding = null ) { * @return bool true on success or false on failure. */ public function loadHTML( $source, $options = 0 ) { - $source = $this->maybe_add_head_or_body( $source ); + $source = $this->normalize_document_structure( $source ); $this->original_encoding = $this->detect_and_strip_encoding( $source ); @@ -103,7 +115,8 @@ public function loadHTML( $source, $options = 0 ) { $source = preg_replace( '/]*)?>/i', '$1', - $source + $source, + 1 ); $success = parent::loadHTML( $source, $options ); @@ -148,60 +161,83 @@ public function saveHTML( DOMNode $node = null ) { preg_quote( self::AMP_ENCODING, '#' ) ), '', - parent::saveHTML( $node ) + parent::saveHTML( $node ), + 1 ); } /** - * Maybe add the , and/or tag(s). + * Normalize the document structure. * - * @param string $content Content to add the head or body tags to. - * @return string Adapted content. + * This makes sure the document adheres to the general structure that AMP requires: + * ``` + * + * + * + * + * + * + * + * + * ``` + * + * @param string $content Content to normalize the structure of. + * @return string Normalized content. */ - private function maybe_add_head_or_body( $content ) { + private function normalize_document_structure( $content ) { $matches = []; + // Unable to parse, so skip normalization and hope for the best. if ( false === preg_match( self::HTML_STRUCTURE_PATTERN, $content, $matches ) ) { return $content; } + // Strip doctype for now. if ( ! empty( $matches['doctype'] ) ) { - // Strip existing doctype. - $content = str_replace( $matches['doctype'], '', $content ); + $content = preg_replace( + sprintf( + '/^.*?%s/s', + str_replace( "\n", '\R', preg_quote( $matches['doctype'], '/' ) ) + ), + '', + $content, + 1 + ); } - if ( empty( $matches['head_start'] ) && empty( $matches['body_start'] ) ) { - var_dump( $matches ); + if ( empty( $matches['head'] ) && empty( $matches['body'] ) ) { // Neither body, nor head, so wrap content in both. $pattern = sprintf( - '/%s(.*)%s/i', - ( empty( $matches['html_start'] ) ? '' : preg_quote( $matches['html_start'], '/' ) ), - ( empty( $matches['html_end'] ) ? '' : preg_quote( $matches['html_end'], '/' ) ) + '/%s(.*)%s/is', + ( empty( $matches['html_start'] ) ? '^\s*' : preg_quote( $matches['html_start'], '/' ) ), + ( empty( $matches['html_end'] ) ? '$\s*' : preg_quote( $matches['html_end'], '/' ) ) ); $content = preg_replace( $pattern, ( empty( $matches['html_start'] ) ? '' : $matches['html_start'] ) . '$1' . ( empty( $matches['html_end'] ) ? '' : $matches['html_end'] ), - $content + $content, + 1 ); - } elseif ( empty( $matches['body_start'] ) && ! empty( $matches['head_start'] ) ) { + } elseif ( empty( $matches['body'] ) && ! empty( $matches['head'] ) ) { // Head without body, so wrap content in body. $pattern = sprintf( - '/%s(.*)%s/i', - preg_quote( $matches['head_end'], '/' ), + '/%s(.*)%s/is', + preg_quote( $matches['head'], '/' ), ( empty( $matches['html_end'] ) ? '' : preg_quote( $matches['html_end'], '/' ) ) ); $content = preg_replace( $pattern, - $matches['head_end'] + $matches['head'] . '$1' . ( empty( $matches['html_end'] ) ? '' : $matches['html_end'] ), - $content + $content, + 1 ); - } elseif ( empty( $matches['head_start'] ) && ! empty( $matches['body_start'] ) ) { + } elseif ( empty( $matches['head'] ) && ! empty( $matches['body'] ) ) { // Body without head, so add empty head before body. - $content = str_replace( $matches['body_start'], '' . $matches['body_start'], $content ); + $content = str_replace( $matches['body'], '' . $matches['body'], $content ); } if ( empty( $matches['html_start'] ) ) { @@ -209,7 +245,7 @@ private function maybe_add_head_or_body( $content ) { $content = "{$content}"; } - // Re-add AMP default doctype. + // Reinsert a standard doctype. $content = "{$content}"; return $content; @@ -224,7 +260,7 @@ private function maybe_add_head_or_body( $content ) { private function adapt_encoding( $source ) { // No encoding was provided, so we need to guess. if ( self::UNKNOWN_ENCODING === $this->original_encoding && function_exists( 'mb_detect_encoding' ) ) { - $this->original_encoding = mb_detect_encoding( $source ); + $this->original_encoding = mb_detect_encoding( $source, self::ENCODING_DETECTION_ORDER, true ); } // Guessing the encoding seems to have failed, so we assume UTF-8 instead. @@ -234,9 +270,14 @@ private function adapt_encoding( $source ) { $this->original_encoding = $this->sanitize_encoding( $this->original_encoding ); - $target = function_exists( 'mb_convert_encoding' ) ? mb_convert_encoding( $source, self::AMP_ENCODING, $this->original_encoding ) : false; + $target = false; + if ( self::AMP_ENCODING !== strtolower( $this->original_encoding ) ) { + $target = function_exists( 'mb_convert_encoding' ) + ? mb_convert_encoding( $source, self::AMP_ENCODING, $this->original_encoding ) + : false; + } - return is_string( $target ) ? $target : $source; + return false !== $target ? $target : $source; } /** @@ -345,11 +386,11 @@ private function sanitize_encoding( $encoding ) { $known_encodings = array_map( 'strtolower', mb_list_encodings() ); } - if ( array_key_exists( $encoding, $this->encoding_map ) ) { - $encoding = $this->encoding_map[ $encoding ]; + if ( array_key_exists( strtolower( $encoding ), $this->encoding_map ) ) { + $encoding = $this->encoding_map[ strtolower( $encoding ) ]; } - if ( ! in_array( $encoding, $known_encodings, true ) ) { + if ( ! in_array( strtolower( $encoding ), $known_encodings, true ) ) { return self::UNKNOWN_ENCODING; } diff --git a/tests/php/test-class-amp-dom-document.php b/tests/php/test-class-amp-dom-document.php index 7933ed74f7b..655730625e8 100644 --- a/tests/php/test-class-amp-dom-document.php +++ b/tests/php/test-class-amp-dom-document.php @@ -19,47 +19,112 @@ class Test_AMP_DOM_Document extends WP_UnitTestCase { */ public function data_dom_document() { return [ - 'minimum_valid_document' => [ + 'minimum_valid_document' => [ 'utf-8', '', '', ], - 'valid_document_with_attributes' => [ + 'valid_document_with_attributes' => [ 'utf-8', '

Text

', '

Text

', ], - 'missing_head' => [ + 'missing_head' => [ 'utf-8', '

Text

', '

Text

', ], - 'missing_body' => [ + 'missing_body' => [ 'utf-8', '

Text

', '

Text

', ], - 'missing_head_and_body' => [ + 'missing_head_and_body' => [ 'utf-8', '

Text

', '

Text

', ], - // @todo This one is still broken. - /*'content_only' => [ + 'missing_html_and_head_and_body' => [ + 'utf-8', + '

Text

', + '

Text

', + ], + 'content_only' => [ 'utf-8', '

Text

', '

Text

', - ],*/ - 'missing_doctype' => [ + ], + 'missing_doctype' => [ 'utf-8', '

Text

', '

Text

', ], - 'html-4_doctype' => [ + 'html_4_doctype' => [ 'utf-8', '

Text

', '

Text

', ], + 'lots_of_whitespace' => [ + 'utf-8', + " \n \n \n \n \n \n \n

\n Text \n

\n\n \n \n ", + '

Text

', + ], + 'utf_8_encoding_predefined' => [ + 'utf-8', + '

مرحبا بالعالم! Check out ‘this’ and “that” and—other things.

', + '

مرحبا بالعالم! Check out ‘this’ and “that” and—other things.

', + ], + 'utf_8_encoding_guessed_via_charset' => [ + '', + '

مرحبا بالعالم! Check out ‘this’ and “that” and—other things.

', + '

مرحبا بالعالم! Check out ‘this’ and “that” and—other things.

', + ], + 'utf_8_encoding_guessed_via_content' => [ + '', + '

مرحبا بالعالم! Check out ‘this’ and “that” and—other things.

', + '

مرحبا بالعالم! Check out ‘this’ and “that” and—other things.

', + ], + 'iso_8859_1_encoding_predefined' => [ + 'iso-8859-1', + utf8_decode( '

ÄÖÜ

' ), + '

ÄÖÜ

', + ], + 'iso_8859_1_encoding_guessed_via_charset' => [ + '', + utf8_decode( '

ÄÖÜ

' ), + '

ÄÖÜ

', + ], + 'iso_8859_1_encoding_guessed_via_content' => [ + '', + utf8_decode( '

ÄÖÜ

' ), + '

ÄÖÜ

', + ], + 'raw_iso_8859_1' => [ + '', + utf8_decode( 'ÄÖÜ' ), + 'ÄÖÜ', + ], + // Make sure we correctly identify the ISO-8859 sub-charsets ("€" does not exist in ISO-8859-1). + 'iso_8859_15_encoding_predefined' => [ + 'iso-8859-1', + mb_convert_encoding( '

', 'ISO-8859-15', 'UTF-8' ), + '

', + ], + 'iso_8859_15_encoding_guessed_via_charset' => [ + '', + mb_convert_encoding( '

', 'ISO-8859-15', 'UTF-8' ), + '

', + ], + 'iso_8859_15_encoding_guessed_via_content' => [ + '', + mb_convert_encoding( '

', 'ISO-8859-15', 'UTF-8' ), + '

', + ], + 'raw_iso_8859_15' => [ + '', + mb_convert_encoding( '€', 'ISO-8859-15', 'UTF-8' ), + '€', + ], ]; } diff --git a/tests/php/test-class-amp-nav-menu-toggle-sanitizer.php b/tests/php/test-class-amp-nav-menu-toggle-sanitizer.php index e0867ba0a46..be25ff21952 100644 --- a/tests/php/test-class-amp-nav-menu-toggle-sanitizer.php +++ b/tests/php/test-class-amp-nav-menu-toggle-sanitizer.php @@ -38,8 +38,8 @@ public function data_converter() { return [ 'container_before_toggle' => [ - '' . $container . $toggle . '', - '' . $amp_state . str_replace( '>', $amp_get_container_attrs( 'nav-menu-wrapper' ) . '>', $container ) . str_replace( '>Toggle', $amp_get_toggle_attrs() . '>Toggle', $toggle ) . '', + '' . $container . $toggle . '', + '' . $amp_state . str_replace( '>', $amp_get_container_attrs( 'nav-menu-wrapper' ) . '>', $container ) . str_replace( '>Toggle', $amp_get_toggle_attrs() . '>Toggle', $toggle ) . '', [ 'nav_container_id' => $container_id, 'menu_button_id' => $toggle_id, @@ -48,8 +48,8 @@ public function data_converter() { ], ], 'toggle_before_container' => [ - '' . $toggle . $container . '', - '' . str_replace( '>Toggle', $amp_get_toggle_attrs() . '>Toggle', $toggle ) . $amp_state . str_replace( '>', $amp_get_container_attrs( 'nav-menu-wrapper' ) . '>', $container ) . '', + '' . $toggle . $container . '', + '' . str_replace( '>Toggle', $amp_get_toggle_attrs() . '>Toggle', $toggle ) . $amp_state . str_replace( '>', $amp_get_container_attrs( 'nav-menu-wrapper' ) . '>', $container ) . '', [ 'nav_container_id' => $container_id, 'menu_button_id' => $toggle_id, @@ -58,8 +58,8 @@ public function data_converter() { ], ], 'container_is_body' => [ - '' . $container . $toggle . '', - '' . $amp_state . $container . str_replace( '>Toggle', $amp_get_toggle_attrs( '', '' ) . '>Toggle', $toggle ) . '', + '' . $container . $toggle . '', + '' . $amp_state . $container . str_replace( '>Toggle', $amp_get_toggle_attrs( '', '' ) . '>Toggle', $toggle ) . '', [ 'nav_container_xpath' => '//body', 'menu_button_id' => $toggle_id, @@ -67,8 +67,8 @@ public function data_converter() { ], ], 'container_is_html' => [ - '' . $container . $toggle . '', - '' . $amp_state . $container . str_replace( '>Toggle', $amp_get_toggle_attrs( '', '' ) . '>Toggle', $toggle ) . '', + '' . $container . $toggle . '', + '' . $amp_state . $container . str_replace( '>Toggle', $amp_get_toggle_attrs( '', '' ) . '>Toggle', $toggle ) . '', [ 'nav_container_xpath' => '//html', 'menu_button_id' => $toggle_id, @@ -76,16 +76,16 @@ public function data_converter() { ], ], 'no_container_provided' => [ - '' . $container . $toggle . '', - '' . $container . '', + '' . $container . $toggle . '', + '' . $container . '', [ 'menu_button_id' => $toggle_id, 'nav_container_toggle_class' => 'toggled-on', ], ], 'no_arguments_provided' => [ - '' . $container . $toggle . '', - '' . $container . $toggle . '', + '' . $container . $toggle . '', + '' . $container . $toggle . '', [], ], ]; From fdd72ccfb272e9419c260ae7bcf50b3c90e95489 Mon Sep 17 00:00:00 2001 From: Alain Schlesser Date: Wed, 27 Nov 2019 18:06:48 +0100 Subject: [PATCH 29/67] Simplify meta sanitizer --- .../sanitizers/class-amp-meta-sanitizer.php | 97 ++----------------- 1 file changed, 6 insertions(+), 91 deletions(-) diff --git a/includes/sanitizers/class-amp-meta-sanitizer.php b/includes/sanitizers/class-amp-meta-sanitizer.php index 256273a564e..6d61d7c8b3b 100644 --- a/includes/sanitizers/class-amp-meta-sanitizer.php +++ b/includes/sanitizers/class-amp-meta-sanitizer.php @@ -30,13 +30,6 @@ class AMP_Meta_Sanitizer extends AMP_Base_Sanitizer { 'use_document_element' => true, // We want to work on the header, so we need the entire document. ]; - /** - * Reference to the shared XPath object to query the DOM. - * - * @var DOMXPath - */ - protected $xpath; - /** * The document's element. * @@ -91,9 +84,6 @@ class AMP_Meta_Sanitizer extends AMP_Base_Sanitizer { * Sanitize. */ public function sanitize() { - $this->xpath = new DOMXPath( $this->dom ); - $this->head = $this->ensure_head_is_present(); - $charset = $this->detect_charset(); $elements = $this->dom->getElementsByTagName( static::$tag ); // Remove all nodes for easy reordering later on. @@ -121,12 +111,7 @@ static function ( $element ) { } } - $this->ensure_charset_is_present( $charset ); - - if ( ! $this->is_correct_charset() ) { // phpcs:ignore Generic.CodeAnalysis.EmptyStatement - // @TODO Re-encode the content into UTF-8. - // ... sure? - } + $this->ensure_charset_is_present(); $this->ensure_viewport_is_present(); @@ -135,71 +120,18 @@ static function ( $element ) { $this->re_add_meta_tags_in_optimized_order(); } - /** - * Ensure that the element is present in the document. - * - * @return DOMElement The document's element. - */ - protected function ensure_head_is_present() { - $head = $this->dom->getElementsByTagName( 'head' )->item( 0 ); - - if ( ! $head ) { - $head = $this->dom->createElement( 'head' ); - $head = $this->dom->documentElement->insertBefore( $head, $this->dom->documentElement->firstChild ); - } - - return $head; - } - - /** - * Detect the charset of the document. - * - * @return string|false Detected charset of the document, or false if none. - */ - protected function detect_charset() { - $charset = false; - - // Check for HTML 4 http-equiv meta tags. - $http_equiv_tag = $this->xpath->query( '//meta[ translate(@http-equiv, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz") = "content-type" and @content ]' )->item( 0 ); - if ( $http_equiv_tag ) { - $http_equiv_tag->parentNode->removeChild( $http_equiv_tag ); - - // Check for the existence of a proper charset attribute first. - $charset = $http_equiv_tag->getAttribute( 'charset' ); - if ( ! $charset ) { - // If not, check whether the charset is included with the content type, and use that. - $content = $http_equiv_tag->getAttribute( 'content' ); - - $matches = []; - if ( preg_match( '/;\s*charset=(?[^;]+)/', $content, $matches ) && ! empty( $matches['charset'] ) ) { - $charset = trim( $matches['charset'] ); - } - } - } - - // Check for HTML 5 charset meta tag. This overrides the HTML 4 charset. - $charset_tag = $this->xpath->query( '//meta[ @charset ]' )->item( 0 ); - if ( $charset_tag ) { - $charset_tag = $charset_tag->parentNode->removeChild( $charset_tag ); - $charset = $charset_tag->getAttribute( 'charset' ); - } - - return $charset; - } /** * Always ensure that we have an HTML 5 charset meta tag. * - * The charset defaults to utf-8, which is also what AMP requires. - * - * @param string|false $charset Optional. Charset that was already detected. False if none. Defaults to false. + * The charset is set to utf-8, which is what AMP requires. */ - protected function ensure_charset_is_present( $charset = false ) { + protected function ensure_charset_is_present() { if ( ! empty( $this->meta_tags[ self::TAG_CHARSET ] ) ) { return; } - $this->meta_tags[ self::TAG_CHARSET ][] = $this->create_charset_element( $charset ?: static::AMP_CHARSET ); + $this->meta_tags[ self::TAG_CHARSET ][] = $this->create_charset_element(); } /** @@ -262,15 +194,14 @@ protected function process_amp_script_meta_tags() { /** * Create a new meta tag for the charset value. * - * @param string $charset Character set to use. * @return DOMElement New meta tag with requested charset. */ - protected function create_charset_element( $charset ) { + protected function create_charset_element() { return AMP_DOM_Utils::create_node( $this->dom, 'meta', [ - 'charset' => strtolower( $charset ), + 'charset' => self::AMP_CHARSET, ] ); } @@ -292,22 +223,6 @@ protected function create_viewport_element( $viewport ) { ); } - /** - * Check whether the charset is the correct one according to AMP requirements. - * - * @return bool Whether the charset is the correct one. - * @throws LengthException If the charset meta tag was not previously retrieved. - */ - protected function is_correct_charset() { - if ( empty( $this->meta_tags[ self::TAG_CHARSET ] ) ) { - throw new LengthException( 'Failed to ensure a charset meta tag is present' ); - } - - $charset_element = $this->meta_tags[ self::TAG_CHARSET ][0]; - - return static::AMP_CHARSET === strtolower( $charset_element->getAttribute( 'charset' ) ); - } - /** * Re-add the meta tags to the node in the optimized order. * From 9ba845c2da61afccd0eed7389bd078299e1da76d Mon Sep 17 00:00:00 2001 From: Alain Schlesser Date: Wed, 27 Nov 2019 18:14:24 +0100 Subject: [PATCH 30/67] Simplify meta sanitizer tests --- tests/php/test-class-amp-meta-sanitizer.php | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/tests/php/test-class-amp-meta-sanitizer.php b/tests/php/test-class-amp-meta-sanitizer.php index d35498fd406..f3d17fa971d 100644 --- a/tests/php/test-class-amp-meta-sanitizer.php +++ b/tests/php/test-class-amp-meta-sanitizer.php @@ -63,20 +63,6 @@ public function get_data_for_sanitize() { '', ], - // Turn HTML 4 charset tag into HTML 5 charset tag. - [ - '', - '', - ], - [ - '', - '', - ], - [ - '', - '', - ], - // Concatenate and reposition script hashes. [ '', From c7ae4c398d46bfa4df87bde0f1736729e52ffb4a Mon Sep 17 00:00:00 2001 From: Alain Schlesser Date: Wed, 27 Nov 2019 18:14:42 +0100 Subject: [PATCH 31/67] Use convenience of extended DOM document --- includes/sanitizers/class-amp-base-sanitizer.php | 2 +- includes/sanitizers/class-amp-meta-sanitizer.php | 13 +++---------- includes/utils/class-amp-dom-document.php | 13 ++++++++++--- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/includes/sanitizers/class-amp-base-sanitizer.php b/includes/sanitizers/class-amp-base-sanitizer.php index b93da28fd99..c91913cbbc7 100644 --- a/includes/sanitizers/class-amp-base-sanitizer.php +++ b/includes/sanitizers/class-amp-base-sanitizer.php @@ -31,7 +31,7 @@ abstract class AMP_Base_Sanitizer { /** * DOM. * - * @var DOMDocument A standard PHP representation of an HTML document in object form. + * @var AMP_DOM_Document A standard PHP representation of an HTML document in object form. * * @since 0.2 */ diff --git a/includes/sanitizers/class-amp-meta-sanitizer.php b/includes/sanitizers/class-amp-meta-sanitizer.php index 6d61d7c8b3b..a74dd563e56 100644 --- a/includes/sanitizers/class-amp-meta-sanitizer.php +++ b/includes/sanitizers/class-amp-meta-sanitizer.php @@ -30,13 +30,6 @@ class AMP_Meta_Sanitizer extends AMP_Base_Sanitizer { 'use_document_element' => true, // We want to work on the header, so we need the entire document. ]; - /** - * The document's element. - * - * @var DOMElement - */ - protected $head; - /* * Tags array keys. */ @@ -84,7 +77,7 @@ class AMP_Meta_Sanitizer extends AMP_Base_Sanitizer { * Sanitize. */ public function sanitize() { - $elements = $this->dom->getElementsByTagName( static::$tag ); + $elements = $this->dom->getElementsByTagName( static::$tag ); // Remove all nodes for easy reordering later on. $elements = array_map( @@ -243,9 +236,9 @@ protected function re_add_meta_tags_in_optimized_order() { foreach ( $this->meta_tags as $meta_tag_group ) { foreach ( $meta_tag_group as $meta_tag ) { if ( $previous_meta_tag ) { - $previous_meta_tag = $this->head->insertBefore( $meta_tag, $previous_meta_tag->nextSibling ); + $previous_meta_tag = $this->dom->head->insertBefore( $meta_tag, $previous_meta_tag->nextSibling ); } else { - $previous_meta_tag = $this->head->insertBefore( $meta_tag, $this->head->firstChild ); + $previous_meta_tag = $this->dom->head->insertBefore( $meta_tag, $this->dom->head->firstChild ); } } } diff --git a/includes/utils/class-amp-dom-document.php b/includes/utils/class-amp-dom-document.php index 3aa03912e02..9baf7016c5f 100644 --- a/includes/utils/class-amp-dom-document.php +++ b/includes/utils/class-amp-dom-document.php @@ -10,7 +10,9 @@ * * @since 1.5 * - * @property DOMXPath $xpath XPath query object for this document. + * @property DOMXPath $xpath XPath query object for this document. + * @property DOMElement $head The document's element. + * @property DOMElement $body The document's element. * * Abstract away some of the difficulties of working with PHP's DOMDocument. */ @@ -144,7 +146,6 @@ public function loadHTML( $source, $options = 0 ) { public function saveHTML( DOMNode $node = null ) { // Force-add http-equiv charset to make DOMDocument behave as it should. // See: http://php.net/manual/en/domdocument.loadhtml.php#78243. - $head = $this->getElementsByTagName( 'head' )->item( 0 ); $charset = AMP_DOM_Utils::create_node( $this, 'meta', @@ -153,7 +154,7 @@ public function saveHTML( DOMNode $node = null ) { 'content' => 'text/html; charset=' . self::AMP_ENCODING, ] ); - $head->insertBefore( $charset, $head->firstChild ); + $this->head->insertBefore( $charset, $this->head->firstChild ); return preg_replace( sprintf( @@ -408,6 +409,12 @@ public function __get( $name ) { case 'xpath': $this->xpath = new DOMXPath( $this ); return $this->xpath; + case 'head': + $this->head = $this->getElementsByTagName( 'head' )->item( 0 ); + return $this->head; + case 'body': + $this->body = $this->getElementsByTagName( 'body' )->item( 0 ); + return $this->body; } // Mimic regular PHP behavior for missing notices. From 71a3e1fdd03a7cff78d4f4ead9eb55f10fb96034 Mon Sep 17 00:00:00 2001 From: Alain Schlesser Date: Wed, 27 Nov 2019 18:52:17 +0100 Subject: [PATCH 32/67] Move theme support and dom utils to new dom document --- includes/class-amp-theme-support.php | 83 +++++++++------------- includes/utils/class-amp-dom-utils.php | 46 +++++------- tests/php/test-class-amp-theme-support.php | 2 +- 3 files changed, 53 insertions(+), 78 deletions(-) diff --git a/includes/class-amp-theme-support.php b/includes/class-amp-theme-support.php index 95aca3438b0..2574249a46e 100644 --- a/includes/class-amp-theme-support.php +++ b/includes/class-amp-theme-support.php @@ -1600,10 +1600,10 @@ public static function filter_admin_bar_script_loader_tag( $tag, $handle ) { * @link https://amp.dev/documentation/guides-and-tutorials/optimize-and-measure/optimize_amp/ * @todo All of this might be better placed inside of a sanitizer. * - * @param DOMDocument $dom Document. - * @param string[] $script_handles AMP script handles for components identified during output buffering. + * @param AMP_DOM_Document $dom Document. + * @param string[] $script_handles AMP script handles for components identified during output buffering. */ - public static function ensure_required_markup( DOMDocument $dom, $script_handles = [] ) { + public static function ensure_required_markup( AMP_DOM_Document $dom, $script_handles = [] ) { /** * Elements. * @@ -1614,18 +1614,14 @@ public static function ensure_required_markup( DOMDocument $dom, $script_handles * @var DOMElement $noscript */ - $xpath = new DOMXPath( $dom ); - - $head = $dom->getElementsByTagName( 'head' )->item( 0 ); - // Ensure there is a schema.org script in the document. // @todo Consider applying the amp_schemaorg_metadata filter on the contents when a script is already present. - $schema_org_meta_script = $xpath->query( '//script[ @type = "application/ld+json" ][ contains( ./text(), "schema.org" ) ]' )->item( 0 ); + $schema_org_meta_script = $dom->xpath->query( '//script[ @type = "application/ld+json" ][ contains( ./text(), "schema.org" ) ]' )->item( 0 ); if ( ! $schema_org_meta_script ) { $script = $dom->createElement( 'script' ); $script->setAttribute( 'type', 'application/ld+json' ); $script->appendChild( $dom->createTextNode( wp_json_encode( amp_get_schemaorg_metadata(), JSON_UNESCAPED_UNICODE ) ) ); - $head->appendChild( $script ); + $dom->head->appendChild( $script ); } // Gather all links. @@ -1642,7 +1638,7 @@ public static function ensure_required_markup( DOMDocument $dom, $script_handles ), ], ]; - $link_elements = $head->getElementsByTagName( 'link' ); + $link_elements = $dom->head->getElementsByTagName( 'link' ); foreach ( $link_elements as $link ) { if ( $link->hasAttribute( 'rel' ) ) { $links[ $link->getAttribute( 'rel' ) ][] = $link; @@ -1660,18 +1656,18 @@ public static function ensure_required_markup( DOMDocument $dom, $script_handles 'href' => self::get_current_canonical_url(), ] ); - $head->appendChild( $rel_canonical ); + $dom->head->appendChild( $rel_canonical ); } // Store the last meta tag as the previous node to append to. - $meta_tags = $head->getElementsByTagName( 'meta' ); - $previous_node = $meta_tags->length > 0 ? $meta_tags->item( $meta_tags->length - 1 ) : $head->firstChild; + $meta_tags = $dom->head->getElementsByTagName( 'meta' ); + $previous_node = $meta_tags->length > 0 ? $meta_tags->item( $meta_tags->length - 1 ) : $dom->head->firstChild; // Handle the title. - $title = $head->getElementsByTagName( 'title' )->item( 0 ); + $title = $dom->head->getElementsByTagName( 'title' )->item( 0 ); if ( $title ) { $title->parentNode->removeChild( $title ); // So we can move it. - $head->insertBefore( $title, $previous_node->nextSibling ); + $dom->head->insertBefore( $title, $previous_node->nextSibling ); $previous_node = $title; } @@ -1687,7 +1683,7 @@ public static function ensure_required_markup( DOMDocument $dom, $script_handles $ordered_scripts = []; $head_scripts = []; $runtime_src = wp_scripts()->registered['amp-runtime']->src; - foreach ( $head->getElementsByTagName( 'script' ) as $script ) { // Note that prepare_response() already moved body scripts to head. + foreach ( $dom->head->getElementsByTagName( 'script' ) as $script ) { // Note that prepare_response() already moved body scripts to head. $head_scripts[] = $script; } foreach ( $head_scripts as $script ) { @@ -1795,7 +1791,7 @@ public static function ensure_required_markup( DOMDocument $dom, $script_handles if ( $link->parentNode ) { $link->parentNode->removeChild( $link ); // So we can move it. } - $head->insertBefore( $link, $previous_node->nextSibling ); + $dom->head->insertBefore( $link, $previous_node->nextSibling ); $previous_node = $link; } } @@ -1829,25 +1825,25 @@ public static function ensure_required_markup( DOMDocument $dom, $script_handles */ $ordered_scripts = array_merge( $ordered_scripts, $amp_scripts ); foreach ( $ordered_scripts as $ordered_script ) { - $head->insertBefore( $ordered_script, $previous_node->nextSibling ); + $dom->insertBefore( $ordered_script, $previous_node->nextSibling ); $previous_node = $ordered_script; } /* * "8. Specify any custom styles by using the ', $content ); diff --git a/tests/php/test-amp-style-sanitizer.php b/tests/php/test-amp-style-sanitizer.php index 8d339b22866..3db2c283785 100644 --- a/tests/php/test-amp-style-sanitizer.php +++ b/tests/php/test-amp-style-sanitizer.php @@ -574,7 +574,7 @@ static function( $preempt, $request, $url ) { 10, 3 ); - $dom = AMP_DOM_Utils::get_dom( $source ); + $dom = Document::from_html( $source ); $error_codes = []; $args = [ @@ -590,7 +590,7 @@ static function( $preempt, $request, $url ) { $whitelist_sanitizer = new AMP_Tag_And_Attribute_Sanitizer( $dom, $args ); $whitelist_sanitizer->sanitize(); - $sanitized_html = AMP_DOM_Utils::get_content_from_dom_node( $dom, $dom->documentElement ); + $sanitized_html = $dom->saveHTML( $dom->documentElement ); $actual_stylesheets = array_values( $sanitizer->get_stylesheets() ); $this->assertEquals( $expected_errors, $error_codes ); $this->assertCount( count( $expected_stylesheets ), $actual_stylesheets ); @@ -725,7 +725,7 @@ public function get_amp_selector_data() { */ public function test_amp_selector_conversion( $markup, $input, $output ) { $html = "$markup"; - $dom = AMP_DOM_Utils::get_dom( $html ); + $dom = Document::from_html( $html ); $sanitizer_classes = amp_get_content_sanitizers(); $sanitized = AMP_Content_Sanitizer::sanitize_document( @@ -871,7 +871,7 @@ static function ( $selector ) { ); $html = "$markup"; - $dom = AMP_DOM_Utils::get_dom( $html ); + $dom = Document::from_html( $html ); $sanitizer_classes = amp_get_content_sanitizers(); @@ -1064,7 +1064,7 @@ public function get_amp_css_hacks_data() { */ public function test_browser_css_hacks( $input ) { $html = ""; - $dom = AMP_DOM_Utils::get_dom( $html ); + $dom = Document::from_html( $html ); $error_codes = []; $sanitizer = new AMP_Style_Sanitizer( @@ -1094,7 +1094,7 @@ public function test_font_data_url_handling() { $html .= ''; // Test with tree-shaking. - $dom = AMP_DOM_Utils::get_dom( $html ); + $dom = Document::from_html( $html ); $error_codes = []; $sanitizer = new AMP_Style_Sanitizer( $dom, @@ -1137,7 +1137,7 @@ public function test_font_data_url_handling_without_file_sources() { $html .= ''; $html .= ''; - $dom = AMP_DOM_Utils::get_dom( $html ); + $dom = Document::from_html( $html ); $error_codes = []; $sanitizer = new AMP_Style_Sanitizer( $dom, @@ -1204,7 +1204,7 @@ public function test_class_amp_bind_preservation() { '; $html .= '...'; - $dom = AMP_DOM_Utils::get_dom( $html ); + $dom = Document::from_html( $html ); $error_codes = []; $sanitizer = new AMP_Style_Sanitizer( @@ -1345,7 +1345,7 @@ public function test_css_manifest() { $html = ob_get_clean(); $error_codes = []; - $dom = AMP_DOM_Utils::get_dom( $html ); + $dom = Document::from_html( $html ); $sanitizer = new AMP_Style_Sanitizer( $dom, array_merge( @@ -1359,8 +1359,7 @@ public function test_css_manifest() { ) ); $sanitizer->sanitize(); - $xpath = new DOMXPath( $dom ); - $style = $xpath->query( '//style[ @amp-custom ]' )->item( 0 ); + $style = $dom->xpath->query( '//style[ @amp-custom ]' )->item( 0 ); return [ $style, $error_codes ]; }; @@ -1434,7 +1433,7 @@ public function test_css_manifest() { */ public function test_relative_background_url_handling() { $html = ''; // phpcs:ignore WordPress.WP.EnqueuedResources.NonEnqueuedStylesheet - $dom = AMP_DOM_Utils::get_dom( $html ); + $dom = Document::from_html( $html ); $sanitizer = new AMP_Style_Sanitizer( $dom, @@ -1443,7 +1442,7 @@ public function test_relative_background_url_handling() { ] ); $sanitizer->sanitize(); - AMP_DOM_Utils::get_content_from_dom_node( $dom, $dom->documentElement ); + $dom->saveHTML( $dom->documentElement ); $actual_stylesheets = array_values( $sanitizer->get_stylesheets() ); $this->assertCount( 1, $actual_stylesheets ); $stylesheet = $actual_stylesheets[0]; @@ -1520,7 +1519,7 @@ static function( $preempt, $request, $url ) use ( $href, &$request_count, $conte $sanitize_and_get_stylesheets = static function() use ( $href ) { $html = sprintf( '', esc_url( $href ) ); // phpcs:ignore WordPress.WP.EnqueuedResources.NonEnqueuedStylesheet - $dom = AMP_DOM_Utils::get_dom( $html ); + $dom = Document::from_html( $html ); $found_error_codes = []; @@ -1534,7 +1533,7 @@ static function( $preempt, $request, $url ) use ( $href, &$request_count, $conte ] ); $sanitizer->sanitize(); - AMP_DOM_Utils::get_content_from_dom_node( $dom, $dom->documentElement ); + $dom->saveHTML( $dom->documentElement ); return [ $found_error_codes, array_values( $sanitizer->get_stylesheets() ) ]; }; @@ -1749,7 +1748,7 @@ public function get_stylesheet_urls() { * @param string $error_code Error code. Optional. */ public function test_get_validated_url_file_path( $source, $expected, $error_code = null ) { - $dom = AMP_DOM_Utils::get_dom( '' ); + $dom = Document::from_html( '' ); $sanitizer = new AMP_Style_Sanitizer( $dom ); $actual = $sanitizer->get_validated_url_file_path( $source, [ 'css' ] ); @@ -1810,7 +1809,7 @@ public function test_remove_spaces_from_url_values( $source, $expected ) { $html .= $source; $html .= ''; - $dom = AMP_DOM_Utils::get_dom( $html ); + $dom = Document::from_html( $html ); $sanitizer = new AMP_Style_Sanitizer( $dom ); $sanitizer->sanitize(); @@ -1866,7 +1865,7 @@ public function test_font_urls( $url, $error_codes ) { $tag = sprintf( '', esc_url( $url ) ); // phpcs:ignore WordPress.WP.EnqueuedResources.NonEnqueuedStylesheet $tag = amp_filter_font_style_loader_tag_with_crossorigin_anonymous( $tag, 'font', $url ); - $dom = AMP_DOM_Utils::get_dom( sprintf( '%s', $tag ) ); + $dom = Document::from_html( sprintf( '%s', $tag ) ); $validation_errors = []; @@ -1907,7 +1906,7 @@ public function test_cors_enabled_stylesheet_url() { // Test supplying crossorigin attribute. $url = 'https://fonts.googleapis.com/css?family=Tangerine'; $link = amp_filter_font_style_loader_tag_with_crossorigin_anonymous( "", 'font', $url ); // phpcs:ignore WordPress.WP.EnqueuedResources.NonEnqueuedStylesheet - $document = AMP_DOM_Utils::get_dom( "$link" ); + $document = Document::from_html( "$link" ); $sanitizer = new AMP_Style_Sanitizer( $document, [ 'use_document_element' => true ] ); $sanitizer->sanitize(); $link = $document->getElementsByTagName( 'link' )->item( 0 ); @@ -1916,7 +1915,7 @@ public function test_cors_enabled_stylesheet_url() { // Test that existing crossorigin attribute is not overridden. $link = amp_filter_font_style_loader_tag_with_crossorigin_anonymous( "", 'font', $url ); // phpcs:ignore WordPress.WP.EnqueuedResources.NonEnqueuedStylesheet - $document = AMP_DOM_Utils::get_dom( "$link" ); // phpcs:ignore WordPress.WP.EnqueuedResources.NonEnqueuedStylesheet + $document = Document::from_html( "$link" ); // phpcs:ignore WordPress.WP.EnqueuedResources.NonEnqueuedStylesheet $sanitizer = new AMP_Style_Sanitizer( $document, [ 'use_document_element' => true ] ); $sanitizer->sanitize(); $link = $document->getElementsByTagName( 'link' )->item( 0 ); @@ -2151,7 +2150,7 @@ static function( $preempt, $request, $url ) use ( $mock_response, $stylesheet_ur 3 ); - $dom = AMP_DOM_Utils::get_dom( $markup ); + $dom = Document::from_html( $markup ); if ( ! empty( $options['auto_reject'] ) ) { add_filter( 'amp_validation_error_sanitized', '__return_false' ); @@ -2185,7 +2184,7 @@ public function test_css_import_font() { $markup .= sprintf( '', $stylesheet_url ); $markup .= 'hello'; - $dom = AMP_DOM_Utils::get_dom( $markup ); + $dom = Document::from_html( $markup ); $sanitizer = new AMP_Style_Sanitizer( $dom, [ @@ -2197,8 +2196,7 @@ public function test_css_import_font() { $this->assertCount( 1, $stylesheets ); $this->assertEquals( 'body{color:red}', $stylesheets[0] ); - $xpath = new DOMXPath( $dom ); - $link = $xpath->query( '//link[ @rel = "stylesheet" ]' )->item( 0 ); + $link = $dom->xpath->query( '//link[ @rel = "stylesheet" ]' )->item( 0 ); $this->assertInstanceOf( 'DOMElement', $link ); $this->assertEquals( set_url_scheme( $stylesheet_url, 'https' ), $link->getAttribute( 'href' ) ); } @@ -2247,7 +2245,7 @@ public function test_style_element_cdata() { $html .= ''; $html .= '

Hello World

'; - $dom = AMP_DOM_Utils::get_dom( $html ); + $dom = Document::from_html( $html ); $sanitizer = new AMP_Style_Sanitizer( $dom, [ @@ -2257,8 +2255,7 @@ public function test_style_element_cdata() { $sanitizer->sanitize(); - $xpath = new DOMXPath( $dom ); - $style = $xpath->query( '//style[ @amp-custom ]' )->item( 0 ); + $style = $dom->xpath->query( '//style[ @amp-custom ]' )->item( 0 ); $this->assertInstanceOf( 'DOMElement', $style ); $expected = "body{color:red}body{color:green}body{color:blue}\n\n/*# sourceURL=amp-custom.css */"; @@ -2272,7 +2269,7 @@ public function test_style_element_cdata() { */ public function test_body_font_stylesheet_moved_to_head() { $html = ''; // phpcs:ignore - $dom = AMP_DOM_Utils::get_dom( $html ); + $dom = Document::from_html( $html ); $link = $dom->getElementById( 'the-font' ); $this->assertInstanceOf( 'DOMElement', $link ); @@ -2362,7 +2359,7 @@ function( $original_dom, $original_source, $amphtml_dom, $amphtml_source ) { */ $this->assertInstanceOf( 'DOMElement', $original_dom->getElementById( 'wpadminbar' ), 'Expected admin bar element to be present originally.' ); $this->assertInstanceOf( 'DOMElement', $original_dom->getElementById( 'admin-bar-css' ), 'Expected admin bar CSS to be present originally.' ); - $this->assertContains( 'admin-bar', $original_dom->getElementsByTagName( 'body' )->item( 0 )->getAttribute( 'class' ) ); + $this->assertContains( 'admin-bar', $original_dom->body->getAttribute( 'class' ) ); $this->assertContains( 'earlyprintstyle', $original_source, 'Expected early print style to not be present.' ); $this->assertContains( '.is-style-outline .wp-block-button__link', $amphtml_source, 'Expected block-library/style.css' ); @@ -2370,7 +2367,7 @@ function( $original_dom, $original_source, $amphtml_dom, $amphtml_source ) { $this->assertContains( 'amp-img.amp-wp-enforced-sizes', $amphtml_source, 'Expected amp-default.css' ); $this->assertContains( 'ab-empty-item', $amphtml_source, 'Expected admin-bar.css to still be present.' ); $this->assertNotContains( 'earlyprintstyle', $amphtml_source, 'Expected early print style to not be present.' ); - $this->assertContains( 'admin-bar', $amphtml_dom->getElementsByTagName( 'body' )->item( 0 )->getAttribute( 'class' ) ); + $this->assertContains( 'admin-bar', $amphtml_dom->body->getAttribute( 'class' ) ); $this->assertInstanceOf( 'DOMElement', $amphtml_dom->getElementById( 'wpadminbar' ) ); }, ], @@ -2408,7 +2405,7 @@ public function test_prioritized_stylesheets( $html_generator, $assert ) { $this->go_to( home_url() ); $html = $html_generator(); - $original_dom = AMP_DOM_Utils::get_dom( $html ); + $original_dom = Document::from_html( $html ); $amphtml_dom = clone $original_dom; $error_codes = []; @@ -2628,7 +2625,7 @@ public function test_get_stylesheet_priority( $node_data, $expected ) { return 'child-theme'; }; - $dom = new DOMDocument(); + $dom = new Document(); if ( isset( $node_data[0] ) ) { $node = AMP_DOM_Utils::create_node( $dom, $node_data[0], $node_data[1] ); diff --git a/tests/php/test-class-amp-comments-sanitizer.php b/tests/php/test-class-amp-comments-sanitizer.php index 37c52c6e373..67cf94db0ad 100644 --- a/tests/php/test-class-amp-comments-sanitizer.php +++ b/tests/php/test-class-amp-comments-sanitizer.php @@ -20,7 +20,7 @@ class Test_AMP_Comments_Sanitizer extends WP_UnitTestCase { /** * Representation of the DOM. * - * @var DOMDocument + * @var Document */ public $dom; @@ -122,8 +122,7 @@ public function test_add_amp_live_list_comment_attributes() { $this->create_comments_list( $comment_objects ); $instance->sanitize(); - $xpath = new DOMXPath( $this->dom ); - $comments = $xpath->query( '//*[ starts-with( @id, "comment-" ) ]' ); + $comments = $this->dom->xpath->query( '//*[ starts-with( @id, "comment-" ) ]' ); foreach ( $comments as $comment ) { /** diff --git a/tests/php/test-class-amp-core-theme-sanitizer.php b/tests/php/test-class-amp-core-theme-sanitizer.php index 260be683d3b..af626d21c83 100644 --- a/tests/php/test-class-amp-core-theme-sanitizer.php +++ b/tests/php/test-class-amp-core-theme-sanitizer.php @@ -48,7 +48,7 @@ public function test_xpath_from_css_selector( $css_selector, $expected ) { } public function get_get_closest_submenu_data() { - $html = ' + $html = ' '; - $dom = AMP_DOM_Utils::get_dom_from_content( $html ); - $xpath = new DOMXPath( $dom ); + $dom = AMP_DOM_Utils::get_dom_from_content( $html ); return [ // First sub-menu. - [ $dom, $xpath, $xpath->query( "//*[ @id = 'menu-item-2' ]" )->item( 0 ), $xpath->query( "//*[ @id = 'sub-menu-1' ]" )->item( 0 ) ], + [ $dom, $dom->xpath->query( "//*[ @id = 'menu-item-2' ]" )->item( 0 ), $dom->xpath->query( "//*[ @id = 'sub-menu-1' ]" )->item( 0 ) ], // Second sub-menu. - [ $dom, $xpath, $xpath->query( "//*[ @id = 'menu-item-5' ]" )->item( 0 ), $xpath->query( "//*[ @id = 'sub-menu-2' ]" )->item( 0 ) ], + [ $dom, $dom->xpath->query( "//*[ @id = 'menu-item-5' ]" )->item( 0 ), $dom->xpath->query( "//*[ @id = 'sub-menu-2' ]" )->item( 0 ) ], // Sub-menu of second sub-menu. - [ $dom, $xpath, $xpath->query( "//*[ @id = 'menu-item-6' ]" )->item( 0 ), $xpath->query( "//*[ @id = 'sub-menu-3' ]" )->item( 0 ) ], + [ $dom, $dom->xpath->query( "//*[ @id = 'menu-item-6' ]" )->item( 0 ), $dom->xpath->query( "//*[ @id = 'sub-menu-3' ]" )->item( 0 ) ], ]; } @@ -92,10 +91,9 @@ public function get_get_closest_submenu_data() { * @dataProvider get_get_closest_submenu_data * @covers AMP_Core_Theme_Sanitizer::get_closest_submenu() */ - public function test_get_closest_submenu( $dom, $xpath, $element, $expected ) { + public function test_get_closest_submenu( $dom, $element, $expected ) { $sanitizer = new AMP_Core_Theme_Sanitizer( $dom ); - $this->set_private_property( $sanitizer, 'xpath', $xpath ); - $actual = $this->call_private_method( $sanitizer, 'get_closest_submenu', [ $element ] ); + $actual = $this->call_private_method( $sanitizer, 'get_closest_submenu', [ $element ] ); $this->assertEquals( $expected, $actual ); } } diff --git a/tests/php/test-class-amp-dom-document.php b/tests/php/test-class-amp-dom-document.php index ac04a9cd6f8..2ceeee21410 100644 --- a/tests/php/test-class-amp-dom-document.php +++ b/tests/php/test-class-amp-dom-document.php @@ -142,9 +142,7 @@ public function data_dom_document() { * @covers Document::saveHTML() */ public function test_dom_document( $charset, $source, $expected ) { - $document = new Document( '', $charset ); - $document->loadHTML( $source ); - + $document = Document::from_html( $source ); $this->assertEqualMarkup( $expected, $document->saveHTML() ); } @@ -172,25 +170,20 @@ public function assertEqualMarkup( $expected, $actual ) { * @covers \Amp\AmpWP\Dom\Document::convert_amp_bind_attributes() */ public function test_amp_bind_conversion() { - $original = ''; - $dom = new Document(); - $dom->loadHTML( $original ); - $converted = $dom->saveHTML(); + $original = ''; + $converted = Document::from_html( $original )->saveHTML(); $this->assertNotEquals( $original, $converted ); $this->assertContains( Document::AMP_BIND_DATA_ATTR_PREFIX . 'src="myAnimals[currentAnimal].imageUrl"', $converted ); $this->assertContains( 'width="300" height="200" data-foo="bar" selected', $converted ); // Check tag with self-closing attribute. - $original = ''; - $dom = new Document(); - $dom->loadHTML( $original ); - $converted = $dom->saveHTML(); + $original = ''; + $converted = Document::from_html( $original )->saveHTML(); $this->assertNotEquals( $original, $converted ); // Preserve trailing slash that is actually the attribute value. - $original = 'Home'; - $dom = new Document(); - $dom->loadHTML( $original ); + $original = 'Home'; + $dom = Document::from_html( $original ); $converted = $dom->saveHTML( $dom->body->firstChild ); $this->assertEquals( 'Home', $converted ); @@ -201,9 +194,7 @@ public function test_amp_bind_conversion() { '', ]; foreach ( $malformed_html as $html ) { - $dom = new Document(); - $dom->loadHTML( $html ); - $converted = $dom->saveHTML(); + $converted = Document::from_html( $html )->saveHTML(); $this->assertNotContains( Document::AMP_BIND_DATA_ATTR_PREFIX, $converted, "Source: {$html}" ); } } @@ -256,10 +247,73 @@ public function test_attribute_conversion_on_long_iframe_srcdocs( $iterations ) htmlentities( $html ) ); - $dom = new Document(); - $dom->loadHTML( $to_convert ); - $dom->saveHTML(); + Document::from_html( $to_convert )->saveHTML(); $this->assertSame( PREG_NO_ERROR, preg_last_error(), 'Probably failed when backtrack limit was exhausted.' ); } + + /** + * Test that HEAD and BODY elements are always present. + * + * @covers \Amp\AmpWP\Dom\Document::normalize_document_structure() + */ + public function test_ensuring_head_body() { + $html = '

Hello

'; + $dom = Document::from_html( $html ); + $this->assertEquals( 'head', $dom->documentElement->firstChild->nodeName ); + $this->assertEquals( 0, $dom->documentElement->firstChild->childNodes->length ); + $this->assertEquals( 'body', $dom->documentElement->lastChild->nodeName ); + $this->assertEquals( $dom->documentElement->lastChild, $dom->getElementsByTagName( 'p' )->item( 0 )->parentNode ); + + $html = 'foo'; + $dom = Document::from_html( $html ); + $this->assertEquals( 'head', $dom->documentElement->firstChild->nodeName ); + $this->assertEquals( $dom->documentElement->firstChild, $dom->getElementsByTagName( 'title' )->item( 0 )->parentNode ); + $this->assertEquals( 'body', $dom->documentElement->lastChild->nodeName ); + $this->assertEquals( 0, $dom->documentElement->lastChild->childNodes->length ); + + $html = 'foo

no body

'; + $dom = Document::from_html( $html ); + $this->assertEquals( 'head', $dom->documentElement->firstChild->nodeName ); + $this->assertEquals( $dom->documentElement->firstChild, $dom->getElementsByTagName( 'title' )->item( 0 )->parentNode ); + $p = $dom->getElementsByTagName( 'p' )->item( 0 ); + $this->assertEquals( $dom->documentElement->lastChild, $p->parentNode ); + $this->assertEquals( 'no body', $p->textContent ); + + $html = 'Hello world'; + $dom = Document::from_html( $html ); + $this->assertEquals( 'head', $dom->documentElement->firstChild->nodeName ); + $this->assertEquals( 0, $dom->documentElement->firstChild->childNodes->length ); + $this->assertEquals( 'body', $dom->documentElement->lastChild->nodeName ); + $this->assertEquals( 'Hello world', $dom->documentElement->lastChild->lastChild->textContent ); + } + + + /** + * Test that invalid head nodes are moved to body. + * + * @covers \Amp\AmpWP\Dom\Document::move_invalid_head_nodes_to_body() + */ + public function test_invalid_head_nodes() { + + // Text node. + $html = 'textend'; + $dom = Document::from_html( $html ); + $this->assertNull( $dom->head->firstChild ); + $body_first_child = $dom->body->firstChild; + $this->assertInstanceOf( 'DOMElement', $body_first_child ); + $this->assertEquals( 'text', $body_first_child->textContent ); + + // Valid nodes. + $html = 'a'; + $dom = Document::from_html( $html ); + $this->assertEquals( 8, $dom->head->childNodes->length ); + $this->assertNull( $dom->body->firstChild ); + + // Invalid nodes. + $html = '

hi

'; + $dom = Document::from_html( $html ); + $this->assertNull( $dom->head->firstChild ); + $this->assertEquals( 6, $dom->body->childNodes->length ); + } } diff --git a/tests/php/test-class-amp-dom-utils.php b/tests/php/test-class-amp-dom-utils.php index cd391577bfe..15b2e179597 100644 --- a/tests/php/test-class-amp-dom-utils.php +++ b/tests/php/test-class-amp-dom-utils.php @@ -163,7 +163,7 @@ public function test__get_content_from_dom__br_no_closing_tag() { /** * Test handling of empty elements. * - * @covers \AMP_DOM_Utils::get_dom() + * @covers \Amp\AmpWP\Dom\Document::from_html() * @covers \AMP_DOM_Utils::get_content_from_dom_node() */ public function test_html5_empty_elements() { @@ -191,7 +191,7 @@ public function test_html5_empty_elements() { /** * Test parsing DOM with Mustache or Mustache-like templates. * - * @covers \AMP_DOM_Utils::get_dom() + * @covers \Amp\AmpWP\Dom\Document::from_html() * @covers \AMP_DOM_Utils::get_content_from_dom_node() */ public function test_mustache_replacements() { @@ -224,11 +224,10 @@ public function test_mustache_replacements() { ] ); - $dom = AMP_DOM_Utils::get_dom_from_content( $html ); - $xpath = new DOMXPath( $dom ); + $dom = AMP_DOM_Utils::get_dom_from_content( $html ); // Ensure that JSON in scripts are left intact. - $script = $xpath->query( '//script' )->item( 0 ); + $script = $dom->xpath->query( '//script' )->item( 0 ); $this->assertEquals( $data, json_decode( $script->nodeValue, true ) @@ -242,19 +241,19 @@ public function test_mustache_replacements() { * @var DOMElement $template_img * @var DOMElement $template_blockquote */ - $template_link = $xpath->query( '//template/a' )->item( 0 ); + $template_link = $dom->xpath->query( '//template/a' )->item( 0 ); $this->assertSame( '{{href}}', $template_link->getAttribute( 'href' ) ); $this->assertEquals( 'Hello {{name}}', $template_link->getAttribute( 'title' ) ); // Ensure that mustache var in img[src] attribute is intact. - $template_img = $xpath->query( '//template/a/img' )->item( 0 ); + $template_img = $dom->xpath->query( '//template/a/img' )->item( 0 ); $this->assertEquals( '{{src}}', $template_img->getAttribute( 'src' ) ); // Ensure that mustache var in blockquote[cite] is not changed. - $template_blockquote = $xpath->query( '//template/blockquote' )->item( 0 ); + $template_blockquote = $dom->xpath->query( '//template/blockquote' )->item( 0 ); $this->assertEquals( '{{cite}}', $template_blockquote->getAttribute( 'cite' ) ); - $serialized_html = AMP_DOM_Utils::get_content_from_dom_node( $dom, $dom->documentElement ); + $serialized_html = $dom->saveHTML( $dom->documentElement ); $this->assertContains( '', $serialized_html ); $this->assertContains( '', $serialized_html ); @@ -265,7 +264,7 @@ public function test_mustache_replacements() { /** * Test encoding. * - * @covers \AMP_DOM_Utils::get_dom() + * @covers \Amp\AmpWP\Dom\Document::from_html() */ public function test_get_dom_encoding() { $html = 'مرحبا بالعالم! Check out ‘this’ and “that” and—other things.'; @@ -274,7 +273,7 @@ public function test_get_dom_encoding() { $html .= '

مرحبا بالعالم! Check out ‘this’ and “that” and—other things.

'; $html .= ''; - $document = AMP_DOM_Utils::get_dom( $html ); + $document = Document::from_html( $html ); $this->assertEquals( 'utf-8', $document->encoding ); $paragraphs = $document->getElementsByTagName( 'p' ); @@ -295,51 +294,15 @@ public function test_whitespace_preservation() { $body = " start
  • First
  • Second
\t* one\n\t* two\n\t* three
end "; $html = "$body"; - $dom = AMP_DOM_Utils::get_dom( "$html" ); + $dom = Document::from_html( "$html" ); - $output = AMP_DOM_Utils::get_content_from_dom_node( $dom, $dom->documentElement ); + $output = $dom->saveHTML( $dom->documentElement ); $this->assertEquals( $html, $output ); $output = AMP_DOM_Utils::get_content_from_dom( $dom ); $this->assertEquals( $body, $output ); } - /** - * Test that HEAD and BODY elements are always present. - * - * @covers \AMP_DOM_Utils::get_dom() - */ - public function test_ensuring_head_body() { - $html = '

Hello

'; - $dom = AMP_DOM_Utils::get_dom( $html ); - $this->assertEquals( 'head', $dom->documentElement->firstChild->nodeName ); - $this->assertEquals( 0, $dom->documentElement->firstChild->childNodes->length ); - $this->assertEquals( 'body', $dom->documentElement->lastChild->nodeName ); - $this->assertEquals( $dom->documentElement->lastChild, $dom->getElementsByTagName( 'p' )->item( 0 )->parentNode ); - - $html = 'foo'; - $dom = AMP_DOM_Utils::get_dom( $html ); - $this->assertEquals( 'head', $dom->documentElement->firstChild->nodeName ); - $this->assertEquals( $dom->documentElement->firstChild, $dom->getElementsByTagName( 'title' )->item( 0 )->parentNode ); - $this->assertEquals( 'body', $dom->documentElement->lastChild->nodeName ); - $this->assertEquals( 0, $dom->documentElement->lastChild->childNodes->length ); - - $html = 'foo

no body

'; - $dom = AMP_DOM_Utils::get_dom( $html ); - $this->assertEquals( 'head', $dom->documentElement->firstChild->nodeName ); - $this->assertEquals( $dom->documentElement->firstChild, $dom->getElementsByTagName( 'title' )->item( 0 )->parentNode ); - $p = $dom->getElementsByTagName( 'p' )->item( 0 ); - $this->assertEquals( $dom->documentElement->lastChild, $p->parentNode ); - $this->assertEquals( 'no body', $p->textContent ); - - $html = 'Hello world'; - $dom = AMP_DOM_Utils::get_dom( $html ); - $this->assertEquals( 'head', $dom->documentElement->firstChild->nodeName ); - $this->assertEquals( 0, $dom->documentElement->firstChild->childNodes->length ); - $this->assertEquals( 'body', $dom->documentElement->lastChild->nodeName ); - $this->assertEquals( 'Hello world', $dom->documentElement->lastChild->lastChild->textContent ); - } - /** * Get head node data. * @@ -446,34 +409,6 @@ public function test_is_valid_head_node( $node, $valid ) { $this->assertEquals( $valid, AMP_DOM_Utils::is_valid_head_node( $node ) ); } - /** - * Test that invalid head nodes are moved to body. - * - * @covers \AMP_DOM_Utils::move_invalid_head_nodes_to_body() - */ - public function test_invalid_head_nodes() { - - // Text node. - $html = 'textend'; - $dom = AMP_DOM_Utils::get_dom( $html ); - $this->assertNull( $dom->getElementsByTagName( 'head' )->item( 0 )->firstChild ); - $body_first_child = $dom->getElementsByTagName( 'body' )->item( 0 )->firstChild; - $this->assertInstanceOf( 'DOMElement', $body_first_child ); - $this->assertEquals( 'text', $body_first_child->textContent ); - - // Valid nodes. - $html = 'a'; - $dom = AMP_DOM_Utils::get_dom( $html ); - $this->assertEquals( 8, $dom->getElementsByTagName( 'head' )->item( 0 )->childNodes->length ); - $this->assertNull( $dom->getElementsByTagName( 'body' )->item( 0 )->firstChild ); - - // Invalid nodes. - $html = '

hi

'; - $dom = AMP_DOM_Utils::get_dom( $html ); - $this->assertNull( $dom->getElementsByTagName( 'head' )->item( 0 )->firstChild ); - $this->assertEquals( 6, $dom->getElementsByTagName( 'body' )->item( 0 )->childNodes->length ); - } - public function get_has_class_data() { $dom = new Document(); diff --git a/tests/php/test-class-amp-link-sanitizer.php b/tests/php/test-class-amp-link-sanitizer.php index f1fed58531a..01cca805b5e 100644 --- a/tests/php/test-class-amp-link-sanitizer.php +++ b/tests/php/test-class-amp-link-sanitizer.php @@ -142,10 +142,9 @@ public function test_amp_to_amp_navigation( $paired ) { } // Confirm changes to form. - $xpath = new DOMXPath( $dom ); - $this->assertEquals( 1, $xpath->query( '//form[ @id = "internal-search" ]//input[ @name = "amp" ]' )->length ); - $this->assertEquals( 0, $xpath->query( '//form[ @id = "internal-post" ]//input[ @name = "amp" ]' )->length ); - $this->assertEquals( 0, $xpath->query( '//form[ @id = "external-search" ]//input[ @name = "amp" ]' )->length ); + $this->assertEquals( 1, $dom->xpath->query( '//form[ @id = "internal-search" ]//input[ @name = "amp" ]' )->length ); + $this->assertEquals( 0, $dom->xpath->query( '//form[ @id = "internal-post" ]//input[ @name = "amp" ]' )->length ); + $this->assertEquals( 0, $dom->xpath->query( '//form[ @id = "external-search" ]//input[ @name = "amp" ]' )->length ); } /** @@ -176,13 +175,12 @@ public function get_test_amp_to_amp_meta_tag_data() { * @param string $expected_meta Expected meta content. */ public function test_amp_to_amp_meta_tag( $sanitizer_args, $expected_meta ) { - $dom = AMP_DOM_Utils::get_dom_from_content( '
Hello
' ); - $xpath = new DOMXPath( $dom ); + $dom = AMP_DOM_Utils::get_dom_from_content( '
Hello
' ); $sanitizer = new AMP_Link_Sanitizer( $dom, $sanitizer_args ); $sanitizer->sanitize(); - $meta_tag = $xpath->query( "//meta[ @name = 'amp-to-amp-navigation' ]" )->item( 0 ); + $meta_tag = $dom->xpath->query( "//meta[ @name = 'amp-to-amp-navigation' ]" )->item( 0 ); $this->assertInstanceOf( 'DOMElement', $meta_tag ); $this->assertEquals( $expected_meta, $meta_tag->getAttribute( 'content' ) ); } diff --git a/tests/php/test-class-amp-meta-sanitizer.php b/tests/php/test-class-amp-meta-sanitizer.php index f3d17fa971d..0ea2c75af15 100644 --- a/tests/php/test-class-amp-meta-sanitizer.php +++ b/tests/php/test-class-amp-meta-sanitizer.php @@ -5,6 +5,8 @@ * @package AMP */ +use Amp\AmpWP\Dom\Document; + /** * Tests for AMP_Meta_Sanitizer. */ @@ -81,7 +83,7 @@ public function get_data_for_sanitize() { * @param string $expected_content Expected content after sanitization. */ public function test_sanitize( $source_content, $expected_content ) { - $dom = AMP_DOM_Utils::get_dom( $source_content ); + $dom = Document::from_html( $source_content ); $sanitizer = new AMP_Meta_Sanitizer( $dom ); $sanitizer->sanitize(); diff --git a/tests/php/test-class-amp-nav-menu-toggle-sanitizer.php b/tests/php/test-class-amp-nav-menu-toggle-sanitizer.php index be25ff21952..49b9839c739 100644 --- a/tests/php/test-class-amp-nav-menu-toggle-sanitizer.php +++ b/tests/php/test-class-amp-nav-menu-toggle-sanitizer.php @@ -5,6 +5,8 @@ * @package AMP */ +use Amp\AmpWP\Dom\Document; + /** * Tests for AMP_Nav_Menu_Toggle_Sanitizer. * @@ -104,12 +106,12 @@ public function data_converter() { * @covers AMP_Nav_Menu_Toggle_Sanitizer::get_menu_button() */ public function test_converter( $source, $expected, $args = [] ) { - $dom = AMP_DOM_Utils::get_dom( $source ); + $dom = Document::from_html( $source ); $sanitizer = new AMP_Nav_Menu_Toggle_Sanitizer( $dom, $args ); $sanitizer->sanitize(); - $content = AMP_DOM_Utils::get_content_from_dom_node( $dom, $dom->documentElement ); + $content = $dom->saveHTML( $dom->documentElement ); $this->assertEquals( $expected, $content ); } diff --git a/tests/php/test-class-amp-theme-support.php b/tests/php/test-class-amp-theme-support.php index 1975e375305..de3481eee3f 100644 --- a/tests/php/test-class-amp-theme-support.php +++ b/tests/php/test-class-amp-theme-support.php @@ -1600,10 +1600,9 @@ public function test_scripts_get_moved_to_head() { $html = ob_get_clean(); $html = AMP_Theme_Support::prepare_response( $html ); - $dom = AMP_DOM_Utils::get_dom( $html ); - $xpath = new DOMXPath( $dom ); + $dom = Document::from_html( $html ); - $scripts = $xpath->query( '//script[ not( @type ) or @type = "text/javascript" ]' ); + $scripts = $dom->xpath->query( '//script[ not( @type ) or @type = "text/javascript" ]' ); $this->assertSame( 3, $scripts->length ); foreach ( $scripts as $script ) { $this->assertSame( 'head', $script->parentNode->nodeName ); @@ -1660,12 +1659,11 @@ public function test_unneeded_scripts_get_removed() { $html = ob_get_clean(); $html = AMP_Theme_Support::prepare_response( $html ); - $dom = AMP_DOM_Utils::get_dom( $html ); - $xpath = new DOMXPath( $dom ); + $dom = Document::from_html( $html ); /** @var DOMElement $script Script. */ $actual_script_srcs = []; - foreach ( $xpath->query( '//script[ not( @type ) or @type = "text/javascript" ]' ) as $script ) { + foreach ( $dom->xpath->query( '//script[ not( @type ) or @type = "text/javascript" ]' ) as $script ) { $actual_script_srcs[] = $script->getAttribute( 'src' ); } @@ -1713,8 +1711,7 @@ public function test_duplicate_scripts_are_removed() { $html = ob_get_clean(); $html = AMP_Theme_Support::prepare_response( $html ); - $dom = AMP_DOM_Utils::get_dom( $html ); - $xpath = new DOMXPath( $dom ); + $dom = Document::from_html( $html ); $script_srcs = []; /** @@ -1722,7 +1719,7 @@ public function test_duplicate_scripts_are_removed() { * * @var DOMElement $script */ - $scripts = $xpath->query( '//script[ @src ]' ); + $scripts = $dom->xpath->query( '//script[ @src ]' ); foreach ( $scripts as $script ) { $script_srcs[] = $script->getAttribute( 'src' ); } diff --git a/tests/php/test-tag-and-attribute-sanitizer.php b/tests/php/test-tag-and-attribute-sanitizer.php index 163fcd3db27..b224e899e22 100644 --- a/tests/php/test-tag-and-attribute-sanitizer.php +++ b/tests/php/test-tag-and-attribute-sanitizer.php @@ -5,6 +5,8 @@ * @package AMP */ +use Amp\AmpWP\Dom\Document; + /** * Test AMP_Tag_And_Attribute_Sanitizer * @@ -2312,7 +2314,7 @@ public function get_html_data() { */ public function test_sanitize( $source, $expected = null, $expected_scripts = [], $expected_errors = [] ) { $expected = isset( $expected ) ? $expected : $source; - $dom = AMP_DOM_Utils::get_dom( $source ); + $dom = Document::from_html( $source ); $actual_errors = []; $sanitizer = new AMP_Tag_And_Attribute_Sanitizer( $dom, @@ -2325,7 +2327,7 @@ public function test_sanitize( $source, $expected = null, $expected_scripts = [] ] ); $sanitizer->sanitize(); - $content = AMP_DOM_Utils::get_content_from_dom_node( $dom, $dom->documentElement ); + $content = $dom->saveHTML( $dom->documentElement ); $this->assertEqualMarkup( $expected, $content ); $this->assertEqualSets( $expected_scripts, array_keys( $sanitizer->get_scripts() ) ); diff --git a/tests/php/validation/test-class-amp-validation-manager.php b/tests/php/validation/test-class-amp-validation-manager.php index 757a27f9951..f071206d5e3 100644 --- a/tests/php/validation/test-class-amp-validation-manager.php +++ b/tests/php/validation/test-class-amp-validation-manager.php @@ -816,8 +816,7 @@ public function test_source_comments() { * @var DOMComment[] $comments */ $comments = []; - $xpath = new DOMXPath( $dom ); - foreach ( $xpath->query( '//comment()' ) as $comment ) { + foreach ( $dom->xpath->query( '//comment()' ) as $comment ) { $comments[] = $comment; } $this->assertCount( 4, $comments ); From 72ae0c6644aa574b2d898d30f1f5b521a28a2d2d Mon Sep 17 00:00:00 2001 From: Alain Schlesser Date: Tue, 17 Dec 2019 17:20:28 +0100 Subject: [PATCH 46/67] Provide a way to retrieve the Dom\Document for a given node --- .../class-amp-validation-manager.php | 6 +--- src/Dom/Document.php | 29 +++++++++++++++++++ .../test-class-amp-validation-manager.php | 1 + 3 files changed, 31 insertions(+), 5 deletions(-) diff --git a/includes/validation/class-amp-validation-manager.php b/includes/validation/class-amp-validation-manager.php index aa71b02d2ec..5d58060bb85 100644 --- a/includes/validation/class-amp-validation-manager.php +++ b/includes/validation/class-amp-validation-manager.php @@ -1050,11 +1050,7 @@ public static function parse_source_comment( DOMComment $comment ) { * } */ public static function locate_sources( DOMNode $node ) { - $dom = $node->ownerDocument; - if ( ! $dom instanceof Document ) { - _doing_it_wrong( 'locate_sources', 'Ended up with a non-normalized DOMDocument. Use Amp\AmpWP\Dom\Document instead.', '1.5.0' ); - } - + $dom = Document::from_node( $node ); $comments = $dom->xpath->query( 'preceding::comment()[ starts-with( ., "amp-source-stack" ) or starts-with( ., "/amp-source-stack" ) ]', $node ); $sources = []; $matches = []; diff --git a/src/Dom/Document.php b/src/Dom/Document.php index 19ef58d4ad9..05462410acd 100644 --- a/src/Dom/Document.php +++ b/src/Dom/Document.php @@ -229,6 +229,35 @@ public static function from_html( $html, $encoding = null ) { return $dom; } + /** + * Named constructor to provide convenient way of retrieving the DOM from a node. + * + * @param DOMNode $node Node to retrieve the DOM from. This is being modified by reference (!). + * @return Document DOM generated from provided HTML, or false if the transformation failed. + */ + public static function from_node( DOMNode &$node ) { + $root = $node->ownerDocument; + + // If the node is the document itself, ownerDocument returns null. + if ( null === $root ) { + $root = $node; + } + + if ( $root instanceof Document ) { + return $root; + } + + $dom = new self(); + + // We replace the $node by reference, to make sure the next lines of code will + // work as expected with the new document. + // Otherwise $dom and $node would refer to two different DOMDocuments. + $node = $dom->importNode( $root->documentElement ?: $root, true ); + $dom->appendChild( $node ); + + return $dom; + } + /** * Load HTML from a string. * diff --git a/tests/php/validation/test-class-amp-validation-manager.php b/tests/php/validation/test-class-amp-validation-manager.php index f071206d5e3..c272ba0311a 100644 --- a/tests/php/validation/test-class-amp-validation-manager.php +++ b/tests/php/validation/test-class-amp-validation-manager.php @@ -93,6 +93,7 @@ public function setUp() { parent::setUp(); $dom_document = new Document( '1.0', 'utf-8' ); $this->node = $dom_document->createElement( self::TAG_NAME ); + $dom_document->appendChild( $this->node ); AMP_Validation_Manager::reset_validation_results(); $this->original_wp_registered_widgets = $GLOBALS['wp_registered_widgets']; From 18e35d28e1376d7133e93b0492fc8712748c72ea Mon Sep 17 00:00:00 2001 From: Alain Schlesser Date: Tue, 17 Dec 2019 17:30:56 +0100 Subject: [PATCH 47/67] Remove Twitter shortcode tests, [tweet] handling was moved to Jetpack --- tests/php/test-amp-twitter-embed.php | 29 ---------------------------- 1 file changed, 29 deletions(-) diff --git a/tests/php/test-amp-twitter-embed.php b/tests/php/test-amp-twitter-embed.php index b453bcb7d3d..2875c796191 100644 --- a/tests/php/test-amp-twitter-embed.php +++ b/tests/php/test-amp-twitter-embed.php @@ -112,35 +112,6 @@ public function get_conversion_data() { 'https://twitter.com/robertnyman/lists/web-gdes' . PHP_EOL, "

\n", ], - - 'shortcode_without_id' => [ - '[tweet]' . PHP_EOL, - '' . PHP_EOL, - ], - 'shortcode_simple' => [ - '[tweet 987437752164737025]' . PHP_EOL, - '', - ], - 'shortcode_with_tweet_attribute' => [ - '[tweet tweet=987437752164737025]' . PHP_EOL, - '', - ], - 'shortcode_with_big_tweet_id' => [ - '[tweet 705219971425574912]' . PHP_EOL, - '', - ], - 'shortcode_with_url' => [ - '[tweet https://twitter.com/wordpress/status/987437752164737025]' . PHP_EOL, - '', - ], - 'shortcode_with_url_with_big_tweet_id' => [ - '[tweet https://twitter.com/wordpress/status/705219971425574912]' . PHP_EOL, - '', - ], - 'shortcode_with_non_numeric_tweet_id' => [ - '[tweet abcd]' . PHP_EOL, - '' . PHP_EOL, - ], ]; } From 3bd83a457eb826c4c456e724b9c6fa91ceeb40a7 Mon Sep 17 00:00:00 2001 From: Alain Schlesser Date: Tue, 17 Dec 2019 19:00:54 +0100 Subject: [PATCH 48/67] Enforce consistent charset meta tag --- src/Dom/Document.php | 57 +++++++++---- tests/php/test-amp-script-sanitizer.php | 10 +-- tests/php/test-class-amp-dom-document.php | 83 ++++++++++--------- ...st-class-amp-nav-menu-toggle-sanitizer.php | 25 +++--- tests/php/test-class-amp-theme-support.php | 2 + .../php/test-tag-and-attribute-sanitizer.php | 4 +- 6 files changed, 110 insertions(+), 71 deletions(-) diff --git a/src/Dom/Document.php b/src/Dom/Document.php index 05462410acd..1c8ee2b5f7c 100644 --- a/src/Dom/Document.php +++ b/src/Dom/Document.php @@ -268,6 +268,9 @@ public static function from_node( DOMNode &$node ) { * @return bool true on success or false on failure. */ public function loadHTML( $source, $options = 0 ) { + // Drop references to old DOM document. + unset ( $this->xpath, $this->head, $this->body ); + $source = $this->convert_amp_bind_attributes( $source ); $source = $this->replace_self_closing_tags( $source ); $source = $this->normalize_document_structure( $source ); @@ -302,6 +305,10 @@ public function loadHTML( $source, $options = 0 ) { } } + // Add the required utf-8 meta charset tag. + $charset = AMP_DOM_Utils::create_node( $this, 'meta', [ 'charset' => self::AMP_ENCODING ] ); + $this->head->insertBefore( $charset, $this->head->firstChild ); + // Do some further clean-up. $this->move_invalid_head_nodes_to_body(); @@ -705,9 +712,11 @@ private function detect_and_strip_encoding( &$content ) { $encoding = self::UNKNOWN_ENCODING; // Check for HTML 4 http-equiv meta tags. - $http_equiv_tag = $this->find_tag( $content, 'meta', 'http-equiv' ); - if ( $http_equiv_tag ) { - $encoding = $this->extract_value( $http_equiv_tag, 'charset' ); + foreach ( $this->find_tags( $content, 'meta', 'http-equiv' ) as $potential_http_equiv_tag ) { + $encoding = $this->extract_value( $potential_http_equiv_tag, 'charset' ); + if ( false !== $encoding ) { + $http_equiv_tag = $potential_http_equiv_tag; + } } // Check for HTML 5 charset meta tag. This overrides the HTML 4 charset. @@ -716,15 +725,13 @@ private function detect_and_strip_encoding( &$content ) { $encoding = $this->extract_value( $charset_tag, 'charset' ); } - // Strip charset tags if they don't fit the AMP UTF-8 requirement. - if ( self::AMP_ENCODING !== strtolower( $encoding ) ) { - if ( $http_equiv_tag ) { - $content = str_replace( $http_equiv_tag, '', $content ); - } + // Strip all charset tags. + if ( isset( $http_equiv_tag ) ) { + $content = str_replace( $http_equiv_tag, '', $content ); + } - if ( $charset_tag ) { - $content = str_replace( $charset_tag, '', $content ); - } + if ( $charset_tag ) { + $content = str_replace( $charset_tag, '', $content ); } return $encoding; @@ -738,9 +745,9 @@ private function detect_and_strip_encoding( &$content ) { * @param string $content Content in which to find the tag. * @param string $element Element of the tag. * @param string $attribute Attribute that the tag contains. - * @return string|false The requested tag, or false if not found. + * @return string[] The requested tags. Returns an empty array if none found. */ - private function find_tag( $content, $element, $attribute = null ) { + private function find_tags( $content, $element, $attribute = null ) { $matches = []; $pattern = empty( $attribute ) ? sprintf( @@ -754,10 +761,30 @@ private function find_tag( $content, $element, $attribute = null ) { ); if ( preg_match( $pattern, $content, $matches ) ) { - return $matches[0]; + return $matches; } - return false; + return []; + } + + /** + * Find a given tag with a given attribute. + * + * If multiple tags match, this method will only return the first one. + * + * @param string $content Content in which to find the tag. + * @param string $element Element of the tag. + * @param string $attribute Attribute that the tag contains. + * @return string|false The requested tag, or false if not found. + */ + private function find_tag( $content, $element, $attribute = null ) { + $matches = $this->find_tags( $content, $element, $attribute ); + + if ( empty( $matches ) ) { + return false; + } + + return $matches[0]; } /** diff --git a/tests/php/test-amp-script-sanitizer.php b/tests/php/test-amp-script-sanitizer.php index ac9b295216c..5f16b06b72f 100644 --- a/tests/php/test-amp-script-sanitizer.php +++ b/tests/php/test-amp-script-sanitizer.php @@ -23,22 +23,22 @@ public function get_noscript_data() { return [ 'document_write' => [ 'Has script? ', - 'Has script? Nope!', + 'Has script? Nope!', ], 'nested_elements' => [ '', - 'before middle end', + 'before middle end', ], 'head_noscript_style' => [ '', - '', + '', ], 'head_noscript_span' => [ '', - 'No script', + 'No script', ], 'test_with_dev_mode' => [ - '', + '', null, ], ]; diff --git a/tests/php/test-class-amp-dom-document.php b/tests/php/test-class-amp-dom-document.php index 2ceeee21410..8bb0cf76dec 100644 --- a/tests/php/test-class-amp-dom-document.php +++ b/tests/php/test-class-amp-dom-document.php @@ -20,112 +20,114 @@ class Test_AMP_DOM_Document extends WP_UnitTestCase { * @return array Data. */ public function data_dom_document() { + $head = ''; + return [ 'minimum_valid_document' => [ 'utf-8', '', - '', + '' . $head . '', ], 'valid_document_with_attributes' => [ 'utf-8', - '

Text

', - '

Text

', + '' . $head . '

Text

', + '' . $head . '

Text

', ], 'missing_head' => [ 'utf-8', '

Text

', - '

Text

', + '' . $head . '

Text

', ], 'missing_body' => [ 'utf-8', - '

Text

', - '

Text

', + '' . $head . '

Text

', + '' . $head . '

Text

', ], 'missing_head_and_body' => [ 'utf-8', '

Text

', - '

Text

', + '' . $head . '

Text

', ], 'missing_html_and_head_and_body' => [ 'utf-8', '

Text

', - '

Text

', + '' . $head . '

Text

', ], 'content_only' => [ 'utf-8', '

Text

', - '

Text

', + '' . $head . '

Text

', ], 'missing_doctype' => [ 'utf-8', - '

Text

', - '

Text

', + '' . $head . '

Text

', + '' . $head . '

Text

', ], 'html_4_doctype' => [ 'utf-8', - '

Text

', - '

Text

', + '' . $head . '

Text

', + '' . $head . '

Text

', ], 'lots_of_whitespace' => [ 'utf-8', " \n \n \n \n \n \n \n

\n Text \n

\n\n \n \n ", - '

Text

', + '' . $head . '

Text

', ], 'utf_8_encoding_predefined' => [ 'utf-8', '

مرحبا بالعالم! Check out ‘this’ and “that” and—other things.

', - '

مرحبا بالعالم! Check out ‘this’ and “that” and—other things.

', + '' . $head . '

مرحبا بالعالم! Check out ‘this’ and “that” and—other things.

', ], 'utf_8_encoding_guessed_via_charset' => [ '', - '

مرحبا بالعالم! Check out ‘this’ and “that” and—other things.

', - '

مرحبا بالعالم! Check out ‘this’ and “that” and—other things.

', + '' . $head . '

مرحبا بالعالم! Check out ‘this’ and “that” and—other things.

', + '' . $head . '

مرحبا بالعالم! Check out ‘this’ and “that” and—other things.

', ], 'utf_8_encoding_guessed_via_content' => [ '', '

مرحبا بالعالم! Check out ‘this’ and “that” and—other things.

', - '

مرحبا بالعالم! Check out ‘this’ and “that” and—other things.

', + '' . $head . '

مرحبا بالعالم! Check out ‘this’ and “that” and—other things.

', ], 'iso_8859_1_encoding_predefined' => [ 'iso-8859-1', utf8_decode( '

ÄÖÜ

' ), - '

ÄÖÜ

', + '' . $head . '

ÄÖÜ

', ], 'iso_8859_1_encoding_guessed_via_charset' => [ '', utf8_decode( '

ÄÖÜ

' ), - '

ÄÖÜ

', + '' . $head . '

ÄÖÜ

', ], 'iso_8859_1_encoding_guessed_via_content' => [ '', utf8_decode( '

ÄÖÜ

' ), - '

ÄÖÜ

', + '' . $head . '

ÄÖÜ

', ], 'raw_iso_8859_1' => [ '', utf8_decode( 'ÄÖÜ' ), - 'ÄÖÜ', + '' . $head . 'ÄÖÜ', ], // Make sure we correctly identify the ISO-8859 sub-charsets ("€" does not exist in ISO-8859-1). 'iso_8859_15_encoding_predefined' => [ 'iso-8859-1', mb_convert_encoding( '

', 'ISO-8859-15', 'UTF-8' ), - '

', + '' . $head . '

', ], 'iso_8859_15_encoding_guessed_via_charset' => [ '', mb_convert_encoding( '

', 'ISO-8859-15', 'UTF-8' ), - '

', + '' . $head . '

', ], 'iso_8859_15_encoding_guessed_via_content' => [ '', mb_convert_encoding( '

', 'ISO-8859-15', 'UTF-8' ), - '

', + '' . $head . '

', ], 'raw_iso_8859_15' => [ '', mb_convert_encoding( '€', 'ISO-8859-15', 'UTF-8' ), - '€', + '' . $head . '€', ], ]; } @@ -258,34 +260,38 @@ public function test_attribute_conversion_on_long_iframe_srcdocs( $iterations ) * @covers \Amp\AmpWP\Dom\Document::normalize_document_structure() */ public function test_ensuring_head_body() { + // The meta charset tag that is automatically added needs to always be taken into account. + $html = '

Hello

'; $dom = Document::from_html( $html ); $this->assertEquals( 'head', $dom->documentElement->firstChild->nodeName ); - $this->assertEquals( 0, $dom->documentElement->firstChild->childNodes->length ); + $this->assertEquals( 1, $dom->head->childNodes->length ); $this->assertEquals( 'body', $dom->documentElement->lastChild->nodeName ); - $this->assertEquals( $dom->documentElement->lastChild, $dom->getElementsByTagName( 'p' )->item( 0 )->parentNode ); + $this->assertEquals( $dom->body, $dom->getElementsByTagName( 'p' )->item( 0 )->parentNode ); $html = 'foo'; $dom = Document::from_html( $html ); $this->assertEquals( 'head', $dom->documentElement->firstChild->nodeName ); - $this->assertEquals( $dom->documentElement->firstChild, $dom->getElementsByTagName( 'title' )->item( 0 )->parentNode ); + $this->assertEquals( $dom->head->firstChild, $dom->getElementsByTagName( 'meta' )->item( 0 ) ); + $this->assertEquals( $dom->head->firstChild->nextSibling, $dom->getElementsByTagName( 'title' )->item( 0 ) ); $this->assertEquals( 'body', $dom->documentElement->lastChild->nodeName ); - $this->assertEquals( 0, $dom->documentElement->lastChild->childNodes->length ); + $this->assertEquals( 0, $dom->body->childNodes->length ); $html = 'foo

no body

'; $dom = Document::from_html( $html ); $this->assertEquals( 'head', $dom->documentElement->firstChild->nodeName ); - $this->assertEquals( $dom->documentElement->firstChild, $dom->getElementsByTagName( 'title' )->item( 0 )->parentNode ); + $this->assertEquals( $dom->head->firstChild, $dom->getElementsByTagName( 'meta' )->item( 0 ) ); + $this->assertEquals( $dom->head->firstChild->nextSibling, $dom->getElementsByTagName( 'title' )->item( 0 ) ); $p = $dom->getElementsByTagName( 'p' )->item( 0 ); - $this->assertEquals( $dom->documentElement->lastChild, $p->parentNode ); + $this->assertEquals( $dom->body, $p->parentNode ); $this->assertEquals( 'no body', $p->textContent ); $html = 'Hello world'; $dom = Document::from_html( $html ); $this->assertEquals( 'head', $dom->documentElement->firstChild->nodeName ); - $this->assertEquals( 0, $dom->documentElement->firstChild->childNodes->length ); + $this->assertEquals( 1, $dom->head->childNodes->length ); $this->assertEquals( 'body', $dom->documentElement->lastChild->nodeName ); - $this->assertEquals( 'Hello world', $dom->documentElement->lastChild->lastChild->textContent ); + $this->assertEquals( 'Hello world', $dom->body->lastChild->textContent ); } @@ -295,11 +301,13 @@ public function test_ensuring_head_body() { * @covers \Amp\AmpWP\Dom\Document::move_invalid_head_nodes_to_body() */ public function test_invalid_head_nodes() { + // The meta charset tag that is automatically added needs to always be taken into account. // Text node. $html = 'textend'; $dom = Document::from_html( $html ); - $this->assertNull( $dom->head->firstChild ); + $this->assertEquals( 'meta', $dom->head->firstChild->tagName ); + $this->assertNull( $dom->head->firstChild->nextSibling ); $body_first_child = $dom->body->firstChild; $this->assertInstanceOf( 'DOMElement', $body_first_child ); $this->assertEquals( 'text', $body_first_child->textContent ); @@ -307,13 +315,14 @@ public function test_invalid_head_nodes() { // Valid nodes. $html = 'a'; $dom = Document::from_html( $html ); - $this->assertEquals( 8, $dom->head->childNodes->length ); + $this->assertEquals( 9, $dom->head->childNodes->length ); $this->assertNull( $dom->body->firstChild ); // Invalid nodes. $html = '

hi

'; $dom = Document::from_html( $html ); - $this->assertNull( $dom->head->firstChild ); + $this->assertEquals( 'meta', $dom->head->firstChild->tagName ); + $this->assertNull( $dom->head->firstChild->nextSibling ); $this->assertEquals( 6, $dom->body->childNodes->length ); } } diff --git a/tests/php/test-class-amp-nav-menu-toggle-sanitizer.php b/tests/php/test-class-amp-nav-menu-toggle-sanitizer.php index 49b9839c739..d3fb7f35a13 100644 --- a/tests/php/test-class-amp-nav-menu-toggle-sanitizer.php +++ b/tests/php/test-class-amp-nav-menu-toggle-sanitizer.php @@ -26,6 +26,7 @@ public function data_converter() { $container = ''; $toggle = ''; + $head = ''; $amp_state = ''; $amp_get_container_attrs = function( $class = '', $toggle_class = 'toggled-on' ) { @@ -40,8 +41,8 @@ public function data_converter() { return [ 'container_before_toggle' => [ - '' . $container . $toggle . '', - '' . $amp_state . str_replace( '>', $amp_get_container_attrs( 'nav-menu-wrapper' ) . '>', $container ) . str_replace( '>Toggle', $amp_get_toggle_attrs() . '>Toggle', $toggle ) . '', + '' . $container . $toggle . '', + '' . $head . '' . $amp_state . str_replace( '>', $amp_get_container_attrs( 'nav-menu-wrapper' ) . '>', $container ) . str_replace( '>Toggle', $amp_get_toggle_attrs() . '>Toggle', $toggle ) . '', [ 'nav_container_id' => $container_id, 'menu_button_id' => $toggle_id, @@ -50,8 +51,8 @@ public function data_converter() { ], ], 'toggle_before_container' => [ - '' . $toggle . $container . '', - '' . str_replace( '>Toggle', $amp_get_toggle_attrs() . '>Toggle', $toggle ) . $amp_state . str_replace( '>', $amp_get_container_attrs( 'nav-menu-wrapper' ) . '>', $container ) . '', + '' . $toggle . $container . '', + '' . $head . '' . str_replace( '>Toggle', $amp_get_toggle_attrs() . '>Toggle', $toggle ) . $amp_state . str_replace( '>', $amp_get_container_attrs( 'nav-menu-wrapper' ) . '>', $container ) . '', [ 'nav_container_id' => $container_id, 'menu_button_id' => $toggle_id, @@ -60,8 +61,8 @@ public function data_converter() { ], ], 'container_is_body' => [ - '' . $container . $toggle . '', - '' . $amp_state . $container . str_replace( '>Toggle', $amp_get_toggle_attrs( '', '' ) . '>Toggle', $toggle ) . '', + '' . $container . $toggle . '', + '' . $head . '' . $amp_state . $container . str_replace( '>Toggle', $amp_get_toggle_attrs( '', '' ) . '>Toggle', $toggle ) . '', [ 'nav_container_xpath' => '//body', 'menu_button_id' => $toggle_id, @@ -69,8 +70,8 @@ public function data_converter() { ], ], 'container_is_html' => [ - '' . $container . $toggle . '', - '' . $amp_state . $container . str_replace( '>Toggle', $amp_get_toggle_attrs( '', '' ) . '>Toggle', $toggle ) . '', + '' . $container . $toggle . '', + '' . $head . '' . $amp_state . $container . str_replace( '>Toggle', $amp_get_toggle_attrs( '', '' ) . '>Toggle', $toggle ) . '', [ 'nav_container_xpath' => '//html', 'menu_button_id' => $toggle_id, @@ -78,16 +79,16 @@ public function data_converter() { ], ], 'no_container_provided' => [ - '' . $container . $toggle . '', - '' . $container . '', + '' . $container . $toggle . '', + '' . $head . '' . $container . '', [ 'menu_button_id' => $toggle_id, 'nav_container_toggle_class' => 'toggled-on', ], ], 'no_arguments_provided' => [ - '' . $container . $toggle . '', - '' . $container . $toggle . '', + '' . $container . $toggle . '', + '' . $head . '' . $container . $toggle . '', [], ], ]; diff --git a/tests/php/test-class-amp-theme-support.php b/tests/php/test-class-amp-theme-support.php index de3481eee3f..53458c6262f 100644 --- a/tests/php/test-class-amp-theme-support.php +++ b/tests/php/test-class-amp-theme-support.php @@ -1931,6 +1931,8 @@ static function ( $url ) { $this->assertNotContains( 'handle=', $sanitized_html ); $this->assertEquals( 2, substr_count( $sanitized_html, '' ) ); + var_dump( $sanitized_html ); + $ordered_contains = [ '', '', diff --git a/tests/php/test-tag-and-attribute-sanitizer.php b/tests/php/test-tag-and-attribute-sanitizer.php index b224e899e22..6943a29ba61 100644 --- a/tests/php/test-tag-and-attribute-sanitizer.php +++ b/tests/php/test-tag-and-attribute-sanitizer.php @@ -2115,7 +2115,7 @@ public function get_html_data() { ], 'bad_meta_charset' => [ 'Mojibake?', - 'Mojibake?', // Note the charset attribute is removed because it violates the attribute spec, but the entire element is not removed because charset is not mandatory. + 'Mojibake?', // Note the charset attribute is removed because it violates the attribute spec, but the entire element is not removed because charset is not mandatory. [], [ AMP_Tag_And_Attribute_Sanitizer::DISALLOWED_ATTR ], // @todo Should actually be invalid_mandatory_attribute? ], @@ -2150,7 +2150,7 @@ public function get_html_data() { ], 'head_with_duplicate_charset' => [ '

Content

', - '

Content

', + '

Content

', [], [ AMP_Tag_And_Attribute_Sanitizer::DUPLICATE_UNIQUE_TAG ], ], From 3bdc795ec8ac3f5431db6092b1e05b06ce655948 Mon Sep 17 00:00:00 2001 From: Alain Schlesser Date: Tue, 17 Dec 2019 19:05:07 +0100 Subject: [PATCH 49/67] Make PHPCS happy --- src/Dom/Document.php | 2 +- tests/php/test-amp-twitter-embed.php | 14 +++++++------- tests/php/test-class-amp-theme-support.php | 2 -- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/src/Dom/Document.php b/src/Dom/Document.php index 1c8ee2b5f7c..5c9278a7660 100644 --- a/src/Dom/Document.php +++ b/src/Dom/Document.php @@ -269,7 +269,7 @@ public static function from_node( DOMNode &$node ) { */ public function loadHTML( $source, $options = 0 ) { // Drop references to old DOM document. - unset ( $this->xpath, $this->head, $this->body ); + unset( $this->xpath, $this->head, $this->body ); $source = $this->convert_amp_bind_attributes( $source ); $source = $this->replace_self_closing_tags( $source ); diff --git a/tests/php/test-amp-twitter-embed.php b/tests/php/test-amp-twitter-embed.php index 2875c796191..182905db9ad 100644 --- a/tests/php/test-amp-twitter-embed.php +++ b/tests/php/test-amp-twitter-embed.php @@ -84,31 +84,31 @@ public function mock_http_request( $preempt, $r, $url ) { */ public function get_conversion_data() { return [ - 'no_embed' => [ + 'no_embed' => [ '

Hello world.

', '

Hello world.

' . PHP_EOL, ], - 'url_simple' => [ + 'url_simple' => [ 'https://twitter.com/wordpress/status/987437752164737025' . PHP_EOL, "
\n

Celebrate the WordPress 15th Anniversary on May 27 https://t.co/jv62WkI9lr pic.twitter.com/4ZECodSK78

\n

— WordPress (@WordPress) April 20, 2018

\n\n", ], - 'url_with_big_tweet_id' => [ + 'url_with_big_tweet_id' => [ 'https://twitter.com/wordpress/status/705219971425574912' . PHP_EOL, "
\n

On our way to the #GoogleDance! #SMX 💃🏻 pic.twitter.com/N8kZ9M3eN4

\n

— Search Engine Land (@sengineland) March 3, 2016

\n\n", ], - 'timeline_url_with_profile' => [ + 'timeline_url_with_profile' => [ 'https://twitter.com/wordpress' . PHP_EOL, "

\n", ], - 'timeline_url_with_likes' => [ + 'timeline_url_with_likes' => [ 'https://twitter.com/wordpress/likes' . PHP_EOL, "

\n", ], - 'timeline_url_with_list' => [ + 'timeline_url_with_list' => [ 'https://twitter.com/wordpress/lists/random_list' . PHP_EOL, "

\n", ], - 'timeline_url_with_list2' => [ + 'timeline_url_with_list2' => [ 'https://twitter.com/robertnyman/lists/web-gdes' . PHP_EOL, "

\n", ], diff --git a/tests/php/test-class-amp-theme-support.php b/tests/php/test-class-amp-theme-support.php index 53458c6262f..de3481eee3f 100644 --- a/tests/php/test-class-amp-theme-support.php +++ b/tests/php/test-class-amp-theme-support.php @@ -1931,8 +1931,6 @@ static function ( $url ) { $this->assertNotContains( 'handle=', $sanitized_html ); $this->assertEquals( 2, substr_count( $sanitized_html, '' ) ); - var_dump( $sanitized_html ); - $ordered_contains = [ '', '', From f4c1013682d209c1cbf4e0c4fec0fd2e88f64dcf Mon Sep 17 00:00:00 2001 From: Alain Schlesser Date: Wed, 18 Dec 2019 17:11:42 +0100 Subject: [PATCH 50/67] Fix tests broken from rebase again --- tests/php/test-tag-and-attribute-sanitizer.php | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tests/php/test-tag-and-attribute-sanitizer.php b/tests/php/test-tag-and-attribute-sanitizer.php index 6943a29ba61..b8069f5ecf9 100644 --- a/tests/php/test-tag-and-attribute-sanitizer.php +++ b/tests/php/test-tag-and-attribute-sanitizer.php @@ -2115,9 +2115,7 @@ public function get_html_data() { ], 'bad_meta_charset' => [ 'Mojibake?', - 'Mojibake?', // Note the charset attribute is removed because it violates the attribute spec, but the entire element is not removed because charset is not mandatory. - [], - [ AMP_Tag_And_Attribute_Sanitizer::DISALLOWED_ATTR ], // @todo Should actually be invalid_mandatory_attribute? + 'Mojibake?', // Note the charset attribute is removed because it violates the attribute spec, but the entire element is not removed because charset is not mandatory. ], 'bad_meta_viewport' => [ '', @@ -2357,7 +2355,7 @@ public function test_sanitize_body_only() { $source = 'Hello'; $expected = 'Hello'; - $dom = AMP_DOM_Utils::get_dom( $source ); + $dom = Document::from_html( $source ); $actual_errors = []; $sanitizer = new AMP_Tag_And_Attribute_Sanitizer( $dom, @@ -2396,7 +2394,7 @@ public function test_is_missing_mandatory_attribute() { ], 'noloading' => [], ]; - $dom = new DOMDocument(); + $dom = new Document(); $node = new DOMElement( 'amp-gist' ); $dom->appendChild( $node ); $sanitizer = new AMP_Tag_And_Attribute_Sanitizer( $dom ); From 9a57bc2e744e96a221697fe0a570618210a35072 Mon Sep 17 00:00:00 2001 From: Alain Schlesser Date: Wed, 18 Dec 2019 17:56:31 +0100 Subject: [PATCH 51/67] Avoid cloning the dom in test --- tests/php/test-amp-style-sanitizer.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/php/test-amp-style-sanitizer.php b/tests/php/test-amp-style-sanitizer.php index 3db2c283785..4a54009b8f9 100644 --- a/tests/php/test-amp-style-sanitizer.php +++ b/tests/php/test-amp-style-sanitizer.php @@ -2406,7 +2406,7 @@ public function test_prioritized_stylesheets( $html_generator, $assert ) { $html = $html_generator(); $original_dom = Document::from_html( $html ); - $amphtml_dom = clone $original_dom; + $amphtml_dom = Document::from_html( $html ); $error_codes = []; $args = [ From 19334a0d9bbf6a4436acdc330ab4aed0858c070d Mon Sep 17 00:00:00 2001 From: Alain Schlesser Date: Wed, 18 Dec 2019 18:03:03 +0100 Subject: [PATCH 52/67] Remove unneeded test --- tests/php/test-class-amp-theme-support.php | 25 ---------------------- 1 file changed, 25 deletions(-) diff --git a/tests/php/test-class-amp-theme-support.php b/tests/php/test-class-amp-theme-support.php index de3481eee3f..2efcf8a44e0 100644 --- a/tests/php/test-class-amp-theme-support.php +++ b/tests/php/test-class-amp-theme-support.php @@ -2333,31 +2333,6 @@ public function test_prepare_response_varying_html() { $this->assertContains( '', $output ); } - /** - * Test prepare_response to inject html[amp] attribute and ensure HTML5 doctype. - * - * @covers AMP_Theme_Support::prepare_response() - */ - public function test_prepare_response_to_add_html5_doctype_and_amp_attribute() { - wp_scripts(); - wp(); - add_filter( 'amp_validation_error_sanitized', '__return_true' ); - add_theme_support( AMP_Theme_Support::SLUG ); - AMP_Theme_Support::init(); - AMP_Theme_Support::finish_init(); - ob_start(); - ?> - - - assertStringStartsWith( '', $sanitized_html ); - $this->assertContains( 'assertContains( '', $sanitized_html ); - } - /** * Test prepare_response will cache redirects when validation errors happen. * From 2010c49334786be2476aa213701205d58433b2af Mon Sep 17 00:00:00 2001 From: Weston Ruter Date: Wed, 18 Dec 2019 10:03:19 -0800 Subject: [PATCH 53/67] Fix reference to xpath_from_css_selector method --- includes/sanitizers/class-amp-core-theme-sanitizer.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/includes/sanitizers/class-amp-core-theme-sanitizer.php b/includes/sanitizers/class-amp-core-theme-sanitizer.php index 336cbe6ef96..660ad8a67ab 100644 --- a/includes/sanitizers/class-amp-core-theme-sanitizer.php +++ b/includes/sanitizers/class-amp-core-theme-sanitizer.php @@ -1715,7 +1715,7 @@ public function add_twentytwenty_toggles() { if ( 'next' === $toggle_target ) { $target_node = $toggle->nextSibling; } else { - $target_xpath = $this->dom->xpath_from_css_selector( $toggle_target ); + $target_xpath = $this->xpath_from_css_selector( $toggle_target ); if ( null === $target_xpath ) { continue; } @@ -1773,7 +1773,7 @@ public function add_twentytwenty_toggles() { $focus_selector = $toggle->getAttribute( 'data-set-focus' ); if ( ! empty( $focus_selector ) ) { - $focus_xpath = $this->dom->xpath_from_css_selector( $focus_selector ); + $focus_xpath = $this->xpath_from_css_selector( $focus_selector ); $focus_element = $this->dom->xpath->query( $focus_xpath )->item( 0 ); if ( $focus_element instanceof DOMElement ) { From 01848c7036dfb7f1d06892a74231b5600abfb91b Mon Sep 17 00:00:00 2001 From: Weston Ruter Date: Wed, 18 Dec 2019 10:19:51 -0800 Subject: [PATCH 54/67] Eliminate usage of deprecated AMP_DOM_Utils::AMP_BIND_DATA_ATTR_PREFIX --- .../class-amp-comments-sanitizer.php | 4 +++- .../class-amp-core-theme-sanitizer.php | 24 ++++++++++--------- .../class-amp-nav-menu-toggle-sanitizer.php | 6 ++--- .../sanitizers/class-amp-style-sanitizer.php | 2 +- .../class-amp-tag-and-attribute-sanitizer.php | 4 ++-- 5 files changed, 22 insertions(+), 18 deletions(-) diff --git a/includes/sanitizers/class-amp-comments-sanitizer.php b/includes/sanitizers/class-amp-comments-sanitizer.php index 64e7f664671..d40353a63f6 100644 --- a/includes/sanitizers/class-amp-comments-sanitizer.php +++ b/includes/sanitizers/class-amp-comments-sanitizer.php @@ -5,6 +5,8 @@ * @package AMP */ +use Amp\AmpWP\Dom\Document; + /** * Class AMP_Comments_Sanitizer * @@ -109,7 +111,7 @@ protected function process_comment_form( $comment_form ) { } } - $amp_bind_attr_format = AMP_DOM_Utils::AMP_BIND_DATA_ATTR_PREFIX . '%s'; + $amp_bind_attr_format = Document::AMP_BIND_DATA_ATTR_PREFIX . '%s'; foreach ( $form_fields as $name => $form_field ) { foreach ( $form_field as $element ) { diff --git a/includes/sanitizers/class-amp-core-theme-sanitizer.php b/includes/sanitizers/class-amp-core-theme-sanitizer.php index 660ad8a67ab..43c74bc9bff 100644 --- a/includes/sanitizers/class-amp-core-theme-sanitizer.php +++ b/includes/sanitizers/class-amp-core-theme-sanitizer.php @@ -6,6 +6,8 @@ * @since 1.0 */ +use Amp\AmpWP\Dom\Document; + /** * Class AMP_Core_Theme_Sanitizer * @@ -1396,12 +1398,12 @@ public function add_twentyfourteen_slider_carousel() { $amp_carousel_desktop_id = 'twentyFourteenSliderDesktop'; $amp_carousel_mobile_id = 'twentyFourteenSliderMobile'; $amp_carousel_attributes = [ - 'layout' => 'responsive', - 'on' => "slideChange:AMP.setState( { $selected_slide_state_id: event.index } )", - 'width' => '100', - 'type' => 'slides', - 'loop' => '', - AMP_DOM_Utils::AMP_BIND_DATA_ATTR_PREFIX . 'slide' => $selected_slide_state_id, + 'layout' => 'responsive', + 'on' => "slideChange:AMP.setState( { $selected_slide_state_id: event.index } )", + 'width' => '100', + 'type' => 'slides', + 'loop' => '', + Document::AMP_BIND_DATA_ATTR_PREFIX . 'slide' => $selected_slide_state_id, ]; $amp_carousel_desktop = AMP_DOM_Utils::create_node( $this->dom, @@ -1449,7 +1451,7 @@ public function add_twentyfourteen_slider_carousel() { $li->setAttribute( 'selected', '' ); $a->setAttribute( 'class', 'slider-active' ); } - $a->setAttribute( AMP_DOM_Utils::AMP_BIND_DATA_ATTR_PREFIX . 'class', "$selected_slide_state_id == $i ? 'slider-active' : ''" ); + $a->setAttribute( Document::AMP_BIND_DATA_ATTR_PREFIX . 'class', "$selected_slide_state_id == $i ? 'slider-active' : ''" ); $a->setAttribute( 'role', 'button' ); $a->setAttribute( 'on', "tap:AMP.setState( { $selected_slide_state_id: $i } )" ); $li->setAttribute( 'option', (string) $i ); @@ -1500,9 +1502,9 @@ public function add_twentyfourteen_search() { // Set visibility and aria-expanded based of the link based on whether the search bar is expanded. $search_toggle_link->setAttribute( 'aria-expanded', wp_json_encode( $hidden ) ); - $search_toggle_link->setAttribute( AMP_DOM_Utils::AMP_BIND_DATA_ATTR_PREFIX . 'aria-expanded', "$hidden_state_id ? 'false' : 'true'" ); - $search_toggle_div->setAttribute( AMP_DOM_Utils::AMP_BIND_DATA_ATTR_PREFIX . 'class', "$hidden_state_id ? 'search-toggle' : 'search-toggle active'" ); - $search_container->setAttribute( AMP_DOM_Utils::AMP_BIND_DATA_ATTR_PREFIX . 'class', "$hidden_state_id ? 'search-box-wrapper hide' : 'search-box-wrapper'" ); + $search_toggle_link->setAttribute( Document::AMP_BIND_DATA_ATTR_PREFIX . 'aria-expanded', "$hidden_state_id ? 'false' : 'true'" ); + $search_toggle_div->setAttribute( Document::AMP_BIND_DATA_ATTR_PREFIX . 'class', "$hidden_state_id ? 'search-toggle' : 'search-toggle active'" ); + $search_container->setAttribute( Document::AMP_BIND_DATA_ATTR_PREFIX . 'class', "$hidden_state_id ? 'search-box-wrapper hide' : 'search-box-wrapper'" ); } /** @@ -1612,7 +1614,7 @@ public function wrap_modal_in_lightbox( $args = [] ) { } /** - * Add generic modal interactivity compat for the Twentytwenty theme. + * Add generic modal interactivity compat for the Twenty Twenty theme. * * Modals implemented in JS will be transformed into equivalents, * with the tap actions being attached to their associated toggles. diff --git a/includes/sanitizers/class-amp-nav-menu-toggle-sanitizer.php b/includes/sanitizers/class-amp-nav-menu-toggle-sanitizer.php index e306be9eb76..9893ff740f0 100644 --- a/includes/sanitizers/class-amp-nav-menu-toggle-sanitizer.php +++ b/includes/sanitizers/class-amp-nav-menu-toggle-sanitizer.php @@ -77,7 +77,7 @@ public function sanitize() { if ( ! empty( $this->args['nav_container_toggle_class'] ) ) { $nav_el->setAttribute( - AMP_DOM_Utils::AMP_BIND_DATA_ATTR_PREFIX . 'class', + Document::AMP_BIND_DATA_ATTR_PREFIX . 'class', sprintf( "%s + ( $state_id ? %s : '' )", wp_json_encode( $nav_el->getAttribute( 'class' ) ), @@ -103,10 +103,10 @@ public function sanitize() { $button_on = sprintf( "tap:AMP.setState({ $state_id: ! $state_id })" ); $button_el->setAttribute( 'on', $button_on ); $button_el->setAttribute( 'aria-expanded', 'false' ); - $button_el->setAttribute( AMP_DOM_Utils::AMP_BIND_DATA_ATTR_PREFIX . 'aria-expanded', "$state_id ? 'true' : 'false'" ); + $button_el->setAttribute( Document::AMP_BIND_DATA_ATTR_PREFIX . 'aria-expanded', "$state_id ? 'true' : 'false'" ); if ( ! empty( $this->args['menu_button_toggle_class'] ) ) { $button_el->setAttribute( - AMP_DOM_Utils::AMP_BIND_DATA_ATTR_PREFIX . 'class', + Document::AMP_BIND_DATA_ATTR_PREFIX . 'class', sprintf( "%s + ( $state_id ? %s : '' )", wp_json_encode( $button_el->getAttribute( 'class' ) ), wp_json_encode( ' ' . $this->args['menu_button_toggle_class'] ) ) ); } diff --git a/includes/sanitizers/class-amp-style-sanitizer.php b/includes/sanitizers/class-amp-style-sanitizer.php index 5fbe039602a..6ec82dea9fc 100644 --- a/includes/sanitizers/class-amp-style-sanitizer.php +++ b/includes/sanitizers/class-amp-style-sanitizer.php @@ -476,7 +476,7 @@ private function get_used_class_names() { } // Find all [class] attributes and capture the contents of any single- or double-quoted strings. - foreach ( $this->dom->xpath->query( '//*/@' . AMP_DOM_Utils::AMP_BIND_DATA_ATTR_PREFIX . 'class' ) as $bound_class_attribute ) { + foreach ( $this->dom->xpath->query( '//*/@' . Document::AMP_BIND_DATA_ATTR_PREFIX . 'class' ) as $bound_class_attribute ) { if ( preg_match_all( '/([\'"])([^\1]*?)\1/', $bound_class_attribute->nodeValue, $matches ) ) { $classes .= ' ' . implode( ' ', $matches[2] ); } diff --git a/includes/sanitizers/class-amp-tag-and-attribute-sanitizer.php b/includes/sanitizers/class-amp-tag-and-attribute-sanitizer.php index 123c1e13e78..cbbf5dc6cbd 100644 --- a/includes/sanitizers/class-amp-tag-and-attribute-sanitizer.php +++ b/includes/sanitizers/class-amp-tag-and-attribute-sanitizer.php @@ -738,7 +738,7 @@ static function ( $validation_error ) { // Check if element needs amp-bind component. if ( $node instanceof DOMElement && ! in_array( 'amp-bind', $this->script_components, true ) ) { foreach ( $node->attributes as $name => $value ) { - if ( AMP_DOM_Utils::AMP_BIND_DATA_ATTR_PREFIX === substr( $name, 0, 14 ) ) { + if ( Document::AMP_BIND_DATA_ATTR_PREFIX === substr( $name, 0, 14 ) ) { $script_components[] = 'amp-bind'; break; } @@ -1703,7 +1703,7 @@ private function is_amp_allowed_attribute( DOMAttr $attr_node, $attr_spec_list ) if ( isset( $attr_spec_list[ $attr_name ] ) || - ( 'data-' === substr( $attr_name, 0, 5 ) && AMP_DOM_Utils::AMP_BIND_DATA_ATTR_PREFIX !== substr( $attr_name, 0, 14 ) ) + ( 'data-' === substr( $attr_name, 0, 5 ) && Document::AMP_BIND_DATA_ATTR_PREFIX !== substr( $attr_name, 0, 14 ) ) || // Allow the 'amp' or '⚡' attribute in , like . ( 'html' === $attr_node->parentNode->nodeName && in_array( $attr_node->nodeName, [ 'amp', '⚡' ], true ) ) From e763fe3a1c329e6ed6cc99bffbb0c85c051305a6 Mon Sep 17 00:00:00 2001 From: Weston Ruter Date: Wed, 18 Dec 2019 12:37:28 -0800 Subject: [PATCH 55/67] Harden type checking for XPath queries --- .../sanitizers/class-amp-nav-menu-toggle-sanitizer.php | 10 ++++++++-- includes/sanitizers/class-amp-o2-player-sanitizer.php | 5 +++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/includes/sanitizers/class-amp-nav-menu-toggle-sanitizer.php b/includes/sanitizers/class-amp-nav-menu-toggle-sanitizer.php index 9893ff740f0..ccebe82187c 100644 --- a/includes/sanitizers/class-amp-nav-menu-toggle-sanitizer.php +++ b/includes/sanitizers/class-amp-nav-menu-toggle-sanitizer.php @@ -125,7 +125,10 @@ protected function get_nav_container() { } if ( ! empty( $this->args['nav_container_xpath'] ) ) { - return $this->dom->xpath->query( $this->args['nav_container_xpath'] )->item( 0 ); + $node = $this->dom->xpath->query( $this->args['nav_container_xpath'] )->item( 0 ); + if ( $node instanceof DOMElement ) { + return $node; + } } return null; @@ -144,7 +147,10 @@ protected function get_menu_button() { } if ( ! empty( $this->args['menu_button_xpath'] ) ) { - return $this->dom->xpath->query( $this->args['menu_button_xpath'] )->item( 0 ); + $node = $this->dom->xpath->query( $this->args['menu_button_xpath'] )->item( 0 ); + if ( $node instanceof DOMElement ) { + return $node; + } } return null; diff --git a/includes/sanitizers/class-amp-o2-player-sanitizer.php b/includes/sanitizers/class-amp-o2-player-sanitizer.php index a62cadcaa8d..2669b5dfc76 100644 --- a/includes/sanitizers/class-amp-o2-player-sanitizer.php +++ b/includes/sanitizers/class-amp-o2-player-sanitizer.php @@ -75,8 +75,9 @@ public function sanitize() { for ( $i = $num_nodes - 1; $i >= 0; $i-- ) { $node = $nodes->item( $i ); - - $this->create_amp_o2_player( $this->dom, $node ); + if ( $node instanceof DOMElement ) { + $this->create_amp_o2_player( $this->dom, $node ); + } } } From cc4badbbc3de5c0be31bb9b05f512ce5246d8a82 Mon Sep 17 00:00:00 2001 From: Weston Ruter Date: Wed, 18 Dec 2019 12:38:14 -0800 Subject: [PATCH 56/67] Improve phpdoc --- .../class-amp-tag-and-attribute-sanitizer.php | 2 +- .../class-amp-validation-manager.php | 6 ++--- .../php/test-class-amp-comments-sanitizer.php | 5 +++++ .../test-class-amp-core-theme-sanitizer.php | 7 ++++++ tests/php/test-class-amp-dom-utils.php | 22 +++++++++++++++++++ ...st-class-amp-nav-menu-toggle-sanitizer.php | 1 - 6 files changed, 38 insertions(+), 5 deletions(-) diff --git a/includes/sanitizers/class-amp-tag-and-attribute-sanitizer.php b/includes/sanitizers/class-amp-tag-and-attribute-sanitizer.php index cbbf5dc6cbd..43284117bdf 100644 --- a/includes/sanitizers/class-amp-tag-and-attribute-sanitizer.php +++ b/includes/sanitizers/class-amp-tag-and-attribute-sanitizer.php @@ -1389,7 +1389,7 @@ private function check_attr_spec_rule_valid_url( DOMElement $node, $attr_name, $ foreach ( $this->extract_attribute_urls( $node->getAttributeNode( $attr_name ) ) as $url ) { $url = urldecode( $url ); - // Check whether the URL is parseable. + // Check whether the URL is parsable. $parts = wp_parse_url( $url ); if ( false === $parts ) { return AMP_Rule_Spec::FAIL; diff --git a/includes/validation/class-amp-validation-manager.php b/includes/validation/class-amp-validation-manager.php index 5d58060bb85..4986b6fb5ef 100644 --- a/includes/validation/class-amp-validation-manager.php +++ b/includes/validation/class-amp-validation-manager.php @@ -29,7 +29,7 @@ class AMP_Validation_Manager { const VALIDATION_ERRORS_QUERY_VAR = 'amp_validation_errors'; /** - * Action name for previewing the status change for invaliud markup. + * Action name for previewing the status change for invalid markup. * * @var string */ @@ -541,7 +541,7 @@ function() { /** * Override validation error statuses (when requested). * - * When a query var is present along with the required nonce, override the status of the status of the invalud markup + * When a query var is present along with the required nonce, override the status of the status of the invalid markup * as requested. * * @since 1.5.0 @@ -1284,7 +1284,7 @@ public static function wrap_hook_callbacks( $hook ) { /** * Reflection. * - * @var ReflectionFunctionAbstract $reflection + * @var ReflectionFunction|ReflectionMethod $reflection */ $reflection = $source['reflection']; unset( $source['reflection'] ); // Omit from stored source. diff --git a/tests/php/test-class-amp-comments-sanitizer.php b/tests/php/test-class-amp-comments-sanitizer.php index 67cf94db0ad..3ff2084e6a2 100644 --- a/tests/php/test-class-amp-comments-sanitizer.php +++ b/tests/php/test-class-amp-comments-sanitizer.php @@ -97,6 +97,11 @@ public function test_process_comment_form() { $this->assertContains( $name, $amp_state->nodeValue ); } foreach ( $form->getElementsByTagName( 'input' ) as $input ) { + /** + * Input. + * + * @var DOMElement $input + */ $on = $input->getAttribute( 'on' ); $this->assertContains( 'change:AMP.setState(', $on ); $this->assertContains( strval( $GLOBALS['post']->ID ), $on ); diff --git a/tests/php/test-class-amp-core-theme-sanitizer.php b/tests/php/test-class-amp-core-theme-sanitizer.php index af626d21c83..a5c93ed3fd8 100644 --- a/tests/php/test-class-amp-core-theme-sanitizer.php +++ b/tests/php/test-class-amp-core-theme-sanitizer.php @@ -39,6 +39,9 @@ public function get_xpath_from_css_selector_data() { * * @dataProvider get_xpath_from_css_selector_data * @covers AMP_Core_Theme_Sanitizer::xpath_from_css_selector() + * + * @param string $css_selector CSS Selector. + * @param string $expected Expected XPath expression. */ public function test_xpath_from_css_selector( $css_selector, $expected ) { $dom = new Document(); @@ -90,6 +93,10 @@ public function get_get_closest_submenu_data() { * * @dataProvider get_get_closest_submenu_data * @covers AMP_Core_Theme_Sanitizer::get_closest_submenu() + * + * @param Document $dom Document. + * @param DOMElement $element Element. + * @param DOMElement $expected Expected element. */ public function test_get_closest_submenu( $dom, $element, $expected ) { $sanitizer = new AMP_Core_Theme_Sanitizer( $dom ); diff --git a/tests/php/test-class-amp-dom-utils.php b/tests/php/test-class-amp-dom-utils.php index 15b2e179597..99e0d29aabb 100644 --- a/tests/php/test-class-amp-dom-utils.php +++ b/tests/php/test-class-amp-dom-utils.php @@ -441,6 +441,10 @@ public function get_has_class_data() { * * @dataProvider get_has_class_data * @covers \AMP_DOM_Utils::has_class() + * + * @param DOMElement $element Element. + * @param string $class Class names. + * @param bool $expected Expected has class name. */ public function test_has_class( DOMElement $element, $class, $expected ) { $actual = AMP_DOM_Utils::has_class( $element, $class ); @@ -479,6 +483,10 @@ public function get_get_element_id_data() { * * @dataProvider get_get_element_id_data * @covers \AMP_DOM_Utils::get_element_id() + * + * @param DOMElement $element Element. + * @param string $prefix Prefix. + * @param string $expected Expected element ID. */ public function test_get_element_id( DOMElement $element, $prefix, $expected ) { $actual = AMP_DOM_Utils::get_element_id( $element, $prefix ); @@ -517,6 +525,11 @@ public function get_add_amp_action_data() { * * @dataProvider get_add_amp_action_data * @covers \AMP_DOM_Utils::add_amp_action() + * + * @param DOMElement $element Element. + * @param string $event Event. + * @param string $action Action. + * @param string $expected Expected. */ public function test_add_amp_action( DOMElement $element, $event, $action, $expected ) { AMP_DOM_Utils::add_amp_action( $element, $event, $action ); @@ -552,6 +565,10 @@ public function get_merge_amp_actions_data() { * * @dataProvider get_merge_amp_actions_data * @covers \AMP_DOM_Utils::merge_amp_actions() + * + * @param string $first First action. + * @param string $second Second action. + * @param string $expected Expected merged actions. */ public function test_merge_amp_actions( $first, $second, $expected ) { $actual = AMP_DOM_Utils::merge_amp_actions( $first, $second ); @@ -739,6 +756,11 @@ public function get_copy_attributes_data() { * * @dataProvider get_copy_attributes_data * @covers \AMP_DOM_Utils::copy_attributes() + * + * @param array $attributes Attributes. + * @param DOMElement $from From element. + * @param DOMElement $to To element. + * @param array $expected Expected. */ public function test_copy_attributes( $attributes, DOMElement $from, DOMElement $to, $expected ) { AMP_DOM_Utils::copy_attributes( $attributes, $from, $to ); diff --git a/tests/php/test-class-amp-nav-menu-toggle-sanitizer.php b/tests/php/test-class-amp-nav-menu-toggle-sanitizer.php index d3fb7f35a13..22c5e392550 100644 --- a/tests/php/test-class-amp-nav-menu-toggle-sanitizer.php +++ b/tests/php/test-class-amp-nav-menu-toggle-sanitizer.php @@ -11,7 +11,6 @@ * Tests for AMP_Nav_Menu_Toggle_Sanitizer. * * @covers AMP_Nav_Menu_Toggle_Sanitizer - * @group testtt */ class Test_AMP_Nav_Menu_Toggle_Sanitizer extends WP_UnitTestCase { From 8cbe6d60d811818079ea846c70b367af1e932f66 Mon Sep 17 00:00:00 2001 From: Weston Ruter Date: Wed, 18 Dec 2019 14:03:54 -0800 Subject: [PATCH 57/67] Remove unused DEFAULT_ARGS from Meta sanitizer; add phpdoc and todo comment --- includes/sanitizers/class-amp-meta-sanitizer.php | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/includes/sanitizers/class-amp-meta-sanitizer.php b/includes/sanitizers/class-amp-meta-sanitizer.php index a74dd563e56..5628ef3f320 100644 --- a/includes/sanitizers/class-amp-meta-sanitizer.php +++ b/includes/sanitizers/class-amp-meta-sanitizer.php @@ -21,15 +21,6 @@ class AMP_Meta_Sanitizer extends AMP_Base_Sanitizer { */ public static $tag = 'meta'; - /** - * Placeholder for default arguments, to be set in child classes. - * - * @var array - */ - protected $DEFAULT_ARGS = [ // phpcs:ignore WordPress.NamingConventions.ValidVariableName.PropertyNotSnakeCase - 'use_document_element' => true, // We want to work on the header, so we need the entire document. - ]; - /* * Tags array keys. */ @@ -139,11 +130,17 @@ protected function ensure_viewport_is_present() { } // Ensure we have the 'width=device-width' setting included. + /** + * Viewport tag. + * + * @var DOMElement $viewport_tag + */ $viewport_tag = $this->meta_tags[ self::TAG_VIEWPORT ][0]; $viewport_content = $viewport_tag->getAttribute( 'content' ); $viewport_settings = array_map( 'trim', explode( ',', $viewport_content ) ); $width_found = false; + // @todo The invalid/missing properties should raise validation errors. See . foreach ( $viewport_settings as $index => $viewport_setting ) { list( $property, $value ) = array_map( 'trim', explode( '=', $viewport_setting ) ); if ( 'width' === $property ) { From fbd790f1092531641cb13a63bfe835df1f9de705 Mon Sep 17 00:00:00 2001 From: Alain Schlesser Date: Thu, 19 Dec 2019 07:07:50 +0100 Subject: [PATCH 58/67] Apply suggestions from code review Co-Authored-By: Weston Ruter --- includes/sanitizers/class-amp-style-sanitizer.php | 2 +- includes/utils/class-amp-dom-utils.php | 6 +++--- src/Dom/Document.php | 3 +-- tests/php/test-class-amp-dom-document.php | 1 - 4 files changed, 5 insertions(+), 7 deletions(-) diff --git a/includes/sanitizers/class-amp-style-sanitizer.php b/includes/sanitizers/class-amp-style-sanitizer.php index 6ec82dea9fc..9bcc886e3f3 100644 --- a/includes/sanitizers/class-amp-style-sanitizer.php +++ b/includes/sanitizers/class-amp-style-sanitizer.php @@ -2795,7 +2795,7 @@ private function remove_admin_bar_if_css_excluded() { if ( ! $included ) { // Remove admin-bar class from body element. // @todo It would be nice if any style rules which refer to .admin-bar could also be removed, but this would mean retroactively going back over the CSS again and re-shaking it. - if ( $this->dom->body instanceof DOMElement && $this->dom->body->hasAttribute( 'class' ) ) { + if ( $this->dom->body->hasAttribute( 'class' ) ) { $this->dom->body->setAttribute( 'class', preg_replace( '/(^|\s)admin-bar(\s|$)/', ' ', $this->dom->body->getAttribute( 'class' ) ) diff --git a/includes/utils/class-amp-dom-utils.php b/includes/utils/class-amp-dom-utils.php index 09e0e521416..2268599f8c1 100644 --- a/includes/utils/class-amp-dom-utils.php +++ b/includes/utils/class-amp-dom-utils.php @@ -72,14 +72,14 @@ class AMP_DOM_Utils { * * @since 0.7 * @see AMP_DOM_Utils::get_content_from_dom_node() - * @deprecated Use Dom\Document::from_html( $html, $encoding ) instead. + * @deprecated Use Amp\AmpWP\Dom\Document::from_html( $html, $encoding ) instead. * * @param string $document Valid HTML document to be represented by a Dom\Document. * @param string $encoding Optional. Encoding to use for the content. * @return Document|false Returns Dom\Document, or false if conversion failed. */ public static function get_dom( $document, $encoding = null ) { - _deprecated_function( __METHOD__, '1.5.0', 'Dom\Document::from_html()' ); + _deprecated_function( __METHOD__, '1.5.0', 'Amp\AmpWP\Dom\Document::from_html()' ); return Document::from_html( $document, $encoding ); } @@ -219,7 +219,7 @@ public static function get_content_from_dom( Document $dom ) { * @return string Returns the HTML content represented in the DOMNode */ public static function get_content_from_dom_node( Document $dom, $node ) { - _deprecated_function( __METHOD__, '1.5.0', 'Dom\Document->saveHtml( $node )' ); + _deprecated_function( __METHOD__, '1.5.0', 'Amp\AmpWP\Dom\Document::saveHtml()' ); return $dom->saveHTML( $node ); } diff --git a/src/Dom/Document.php b/src/Dom/Document.php index 5c9278a7660..418bf0fcab4 100644 --- a/src/Dom/Document.php +++ b/src/Dom/Document.php @@ -119,7 +119,6 @@ final class Document extends DOMDocument { const HTML_FIND_TAG_WITH_ATTRIBUTE_PATTERN = '/<%1$s [^>]*?\s*%2$s=[^>]*?>[^<]*(?:<\/%1$s>)?/i'; const HTML_EXTRACT_ATTRIBUTE_VALUE_PATTERN = '/%s=(?:([\'"])(?.*)?\1|(?[^ \'";]+))/'; - // Tags constants used throughout. const TAG_HEAD = 'head'; const TAG_BODY = 'body'; @@ -661,7 +660,7 @@ private function convert_amp_bind_attributes( $html ) { $html ); - /** + /* * If the regex engine incurred an error during processing, for example exceeding the backtrack * limit, $converted will be null. In this case we return the originally passed document to allow * DOMDocument to attempt to load it. If the AMP HTML doesn't make use of amp-bind or similar diff --git a/tests/php/test-class-amp-dom-document.php b/tests/php/test-class-amp-dom-document.php index 8bb0cf76dec..75ad6efb606 100644 --- a/tests/php/test-class-amp-dom-document.php +++ b/tests/php/test-class-amp-dom-document.php @@ -294,7 +294,6 @@ public function test_ensuring_head_body() { $this->assertEquals( 'Hello world', $dom->body->lastChild->textContent ); } - /** * Test that invalid head nodes are moved to body. * From 41094b488fa7977effd9e25e1473f7197169ea6b Mon Sep 17 00:00:00 2001 From: Alain Schlesser Date: Thu, 19 Dec 2019 07:49:52 +0100 Subject: [PATCH 59/67] Move is_valid_head_node() into Dom\Document --- .../sanitizers/class-amp-script-sanitizer.php | 2 +- includes/utils/class-amp-dom-utils.php | 30 +---- src/Dom/Document.php | 43 +++++- tests/php/test-class-amp-dom-document.php | 123 ++++++++++++++++++ tests/php/test-class-amp-dom-utils.php | 106 --------------- 5 files changed, 169 insertions(+), 135 deletions(-) diff --git a/includes/sanitizers/class-amp-script-sanitizer.php b/includes/sanitizers/class-amp-script-sanitizer.php index 8ef2b44a092..39fd377c879 100644 --- a/includes/sanitizers/class-amp-script-sanitizer.php +++ b/includes/sanitizers/class-amp-script-sanitizer.php @@ -50,7 +50,7 @@ public function sanitize() { $fragment->appendChild( $this->dom->createComment( 'noscript' ) ); while ( $noscript->firstChild ) { if ( $is_inside_head_el && ! $must_move_to_body ) { - $must_move_to_body = ! AMP_DOM_Utils::is_valid_head_node( $noscript->firstChild ); + $must_move_to_body = ! $this->dom->is_valid_head_node( $noscript->firstChild ); } $fragment->appendChild( $noscript->firstChild ); } diff --git a/includes/utils/class-amp-dom-utils.php b/includes/utils/class-amp-dom-utils.php index 2268599f8c1..f033ea8cb30 100644 --- a/includes/utils/class-amp-dom-utils.php +++ b/includes/utils/class-amp-dom-utils.php @@ -47,26 +47,6 @@ class AMP_DOM_Utils { */ const HTML_BODY_CONTENTS_PATTERN = '#^.*?(.*).*?$#si'; - /** - * List of elements allowed in head. - * - * @link https://github.com/ampproject/amphtml/blob/445d6e3be8a5063e2738c6f90fdcd57f2b6208be/validator/engine/htmlparser.js#L83-L100 - * @link https://www.w3.org/TR/html5/document-metadata.html - * - * @todo Turn into const array once PHP minimum is bumped to 5.6+. - * - * @var array - */ - private static $elements_allowed_in_head = [ - 'title', - 'base', - 'link', - 'meta', - 'style', - 'noscript', - 'script', - ]; - /** * Return a valid Dom\Document representing HTML document passed as a parameter. * @@ -88,18 +68,14 @@ public static function get_dom( $document, $encoding = null ) { * * @link https://github.com/ampproject/amphtml/blob/445d6e3be8a5063e2738c6f90fdcd57f2b6208be/validator/engine/htmlparser.js#L83-L100 * @link https://www.w3.org/TR/html5/document-metadata.html + * @deprecated Use Amp\AmpWP\Dom\Document->is_valid_head_node() instead. * * @param DOMNode $node Node. * @return bool Whether valid head node. */ public static function is_valid_head_node( DOMNode $node ) { - return ( - ( $node instanceof DOMElement && in_array( $node->nodeName, self::$elements_allowed_in_head, true ) ) - || - ( $node instanceof DOMText && preg_match( '/^\s*$/', $node->nodeValue ) ) // Whitespace text nodes are OK. - || - $node instanceof DOMComment - ); + _deprecated_function( __METHOD__, '1.5.0', 'Amp\AmpWP\Dom\Document->is_valid_head_node()' ); + return Document::from_node( $node )->is_valid_head_node( $node ); } /** diff --git a/src/Dom/Document.php b/src/Dom/Document.php index 418bf0fcab4..cef08c11a84 100644 --- a/src/Dom/Document.php +++ b/src/Dom/Document.php @@ -8,9 +8,11 @@ namespace Amp\AmpWP\Dom; use AMP_DOM_Utils; +use DOMComment; use DOMDocument; use DOMElement; use DOMNode; +use DOMText; use DOMXPath; /** @@ -180,6 +182,26 @@ final class Document extends DOMDocument { 'wbr', ]; + /** + * List of elements allowed in head. + * + * @link https://github.com/ampproject/amphtml/blob/445d6e3be8a5063e2738c6f90fdcd57f2b6208be/validator/engine/htmlparser.js#L83-L100 + * @link https://www.w3.org/TR/html5/document-metadata.html + * + * @todo Turn into const array once PHP minimum is bumped to 5.6+. + * + * @var string[] + */ + private static $elements_allowed_in_head = [ + 'title', + 'base', + 'link', + 'meta', + 'style', + 'noscript', + 'script', + ]; + /** * Store the placeholder comments that were generated to replace