From d2662334c7f1d1b95eb526dc748f99b6c3922b2f Mon Sep 17 00:00:00 2001 From: Weston Ruter Date: Wed, 24 Jan 2018 01:18:02 -0800 Subject: [PATCH 1/7] Preserve AMP binding attributes through HTML parsing and serialization --- includes/utils/class-amp-dom-utils.php | 108 +++++++++++++++++++++ phpcs.xml | 2 +- tests/test-class-amp-dom-utils.php | 29 ++++++ tests/test-tag-and-attribute-sanitizer.php | 4 + 4 files changed, 142 insertions(+), 1 deletion(-) diff --git a/includes/utils/class-amp-dom-utils.php b/includes/utils/class-amp-dom-utils.php index 8ffd3653e2c..4bb2dc89a87 100644 --- a/includes/utils/class-amp-dom-utils.php +++ b/includes/utils/class-amp-dom-utils.php @@ -56,6 +56,8 @@ public static function get_dom( $document ) { $dom = new DOMDocument(); + $document = self::convert_amp_bind_attributes( $document ); + /* * Wrap in dummy tags, since XML needs one parent node. * It also makes it easier to loop through nodes. @@ -74,6 +76,110 @@ public static function get_dom( $document ) { return $dom; } + /** + * Get attribute prefix for converted amp-bind attributes. + * + * This contains a random string to prevent HTML content containing this data- attribute + * originally from being mutated to contain an amp-bind attribute when attributes are restored. + * + * @since 0.7 + * @see \AMP_DOM_Utils::convert_amp_bind_attributes() + * @see \AMP_DOM_Utils::restore_amp_bind_attributes() + * @link https://www.ampproject.org/docs/reference/components/amp-bind + * + * @return string HTML5 data-* attribute name prefix for AMP binding attributes. + */ + public static function get_amp_bind_placeholder_attribute_prefix() { + static $attribute_prefix; + if ( ! isset( $attribute_prefix ) ) { + $attribute_prefix = sprintf( 'data-amp-binding-%s-', md5( wp_rand() ) ); + } + return $attribute_prefix; + } + + /** + * Replace AMP binding attributes with something that libxml can parse (as HTML5 data-* attributes). + * + * This is necessary necessary because attributes in square brackets are not understood in PHP and + * get dropped with an error raised: + * > Warning: DOMDocument::loadHTML(): error parsing attribute name + * This is a reciprocal function of AMP_DOM_Utils::restore_amp_bind_attributes(). + * + * @since 0.7 + * @see \AMP_DOM_Utils::convert_amp_bind_attributes() + * @link https://www.ampproject.org/docs/reference/components/amp-bind + * + * @param string $html HTML containing amp-bind attributes. + * @return string HTML with AMP binding attributes replaced with HTML5 data-* attributes. + */ + public static function convert_amp_bind_attributes( $html ) { + $amp_bind_attr_prefix = self::get_amp_bind_placeholder_attribute_prefix(); + + // Pattern for HTML attribute accounting for binding attr name, boolean attribute, single/double-quoted attribute value, and unquoted attribute values. + $attr_regex = '#^\s+(?P\[?[a-zA-Z0-9_\-]+\]?)(?P=(?:"[^"]*"|\'[^\']*\'|[^\'"\s]+))?#'; + + /** + * Replace callback. + * + * @param array $tag_matches Tag matches. + * @return string Replacement. + */ + $replace_callback = function( $tag_matches ) use ( $amp_bind_attr_prefix, $attr_regex ) { + $old_attrs = rtrim( $tag_matches['attrs'] ); + $new_attrs = ''; + $offset = 0; + while ( preg_match( $attr_regex, substr( $old_attrs, $offset ), $attr_matches ) ) { + $offset += strlen( $attr_matches[0] ); + + if ( '[' === $attr_matches['name'][0] ) { + $new_attrs .= ' ' . $amp_bind_attr_prefix . trim( $attr_matches['name'], '[]' ); + if ( isset( $attr_matches['value'] ) ) { + $new_attrs .= $attr_matches['value']; + } + } else { + $new_attrs .= $attr_matches[0]; + } + } + + // Bail on parse error which occurs when the regex isn't able to consume the entire $new_attrs string. + if ( strlen( $old_attrs ) !== $offset ) { + return $tag_matches[0]; + } + + return '<' . $tag_matches['name'] . $new_attrs . '>'; + }; + + $html = preg_replace_callback( + // Match all start tags that probably contain a binding attribute. + '#<(?P\w\S+)(?P\s+[^<]+\]=[^<]+)\s*>#', + $replace_callback, + $html + ); + + return $html; + } + + /** + * Convert AMP bind-attributes back to their original syntax. + * + * This is a reciprocal function of AMP_DOM_Utils::convert_amp_bind_attributes(). + * + * @since 0.7 + * @see \AMP_DOM_Utils::convert_amp_bind_attributes() + * @link https://www.ampproject.org/docs/reference/components/amp-bind + * + * @param string $html HTML with amp-bind attributes converted. + * @return string HTML with amp-bind attributes restored. + */ + public static function restore_amp_bind_attributes( $html ) { + $html = preg_replace( + '#\s' . self::get_amp_bind_placeholder_attribute_prefix() . '([a-zA-Z0-9_\-]+)#', + ' [$1]', + $html + ); + return $html; + } + /** * Return a valid DOMDocument representing arbitrary HTML content passed as a parameter. * @@ -175,6 +281,8 @@ public static function get_content_from_dom_node( $dom, $node ) { return ''; } + $html = self::restore_amp_bind_attributes( $html ); + /* * Travis w/PHP 7.1 generates

and
vs.
and
, respectively. * Travis w/PHP 7.x generates vs. . Etc. diff --git a/phpcs.xml b/phpcs.xml index a64861cc5c1..70f790323ea 100644 --- a/phpcs.xml +++ b/phpcs.xml @@ -30,7 +30,7 @@ - + bin/* diff --git a/tests/test-class-amp-dom-utils.php b/tests/test-class-amp-dom-utils.php index 261e77f6596..a76bca4e25c 100644 --- a/tests/test-class-amp-dom-utils.php +++ b/tests/test-class-amp-dom-utils.php @@ -90,4 +90,33 @@ public function test__get_content_from_dom__br_no_closing_tag() { $this->assertEquals( $expected, $actual ); } + + /** + * Test convert_amp_bind_attributes. + * + * @covers \AMP_DOM_Utils::convert_amp_bind_attributes() + * @covers \AMP_DOM_Utils::restore_amp_bind_attributes() + * @covers \AMP_DOM_Utils::get_amp_bind_placeholder_attribute_prefix() + * \AMP_DOM_Utils::restore_amp_bind_attributes() + */ + public function test_amp_bind_conversion() { + $original = ''; + $converted = AMP_DOM_Utils::convert_amp_bind_attributes( $original ); + $this->assertNotEquals( $converted, $original ); + $this->assertContains( AMP_DOM_Utils::get_amp_bind_placeholder_attribute_prefix() . 'src="myAnimals[currentAnimal].imageUrl"', $converted ); + $this->assertContains( 'width=300 height="200" data-foo="bar" selected', $converted ); + $restored = AMP_DOM_Utils::restore_amp_bind_attributes( $converted ); + $this->assertEquals( $original, $restored ); + + // Test malformed. + $malformed_html = array( + '', + '', + '', + ); + foreach ( $malformed_html as $html ) { + $converted = AMP_DOM_Utils::convert_amp_bind_attributes( $html ); + $this->assertNotContains( AMP_DOM_Utils::get_amp_bind_placeholder_attribute_prefix(), $converted ); + } + } } diff --git a/tests/test-tag-and-attribute-sanitizer.php b/tests/test-tag-and-attribute-sanitizer.php index d19395a8a82..649b3651783 100644 --- a/tests/test-tag-and-attribute-sanitizer.php +++ b/tests/test-tag-and-attribute-sanitizer.php @@ -520,6 +520,10 @@ public function get_data() { 'HeadlineSpan', 'HeadlineSpan', ), + + 'amp_bind_attr' => array( + '

Hello World

', + ), ); } From cbe9d53a9165d10ba2f4ee950a0ef9797dacab55 Mon Sep 17 00:00:00 2001 From: Weston Ruter Date: Wed, 24 Jan 2018 13:46:16 -0800 Subject: [PATCH 2/7] Add required AMP component scripts only if element is not sanitized out of doc --- .../class-amp-tag-and-attribute-sanitizer.php | 18 ++++++++++-------- tests/test-class-amp-dom-utils.php | 1 - 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/includes/sanitizers/class-amp-tag-and-attribute-sanitizer.php b/includes/sanitizers/class-amp-tag-and-attribute-sanitizer.php index abbb2fa5c12..2f439b0fb66 100644 --- a/includes/sanitizers/class-amp-tag-and-attribute-sanitizer.php +++ b/includes/sanitizers/class-amp-tag-and-attribute-sanitizer.php @@ -279,19 +279,21 @@ private function process_node( $node ) { return; } - // Obtain list of component scripts required. - if ( ! empty( $tag_spec['also_requires_tag_warning'] ) ) { - $this->script_components[] = strtok( $tag_spec['also_requires_tag_warning'][0], ' ' ); - } - if ( ! empty( $tag_spec['requires_extension'] ) ) { - $this->script_components = array_merge( $this->script_components, $tag_spec['requires_extension'] ); - } - // Remove any remaining disallowed attributes. $this->sanitize_disallowed_attributes_in_node( $node, $attr_spec_list ); // Remove values that don't conform to the attr_spec. $this->sanitize_disallowed_attribute_values_in_node( $node, $attr_spec_list ); + + // Add required AMP component scripts if the element is still in the document. + if ( $node->parentNode ) { + if ( ! empty( $tag_spec['also_requires_tag_warning'] ) ) { + $this->script_components[] = strtok( $tag_spec['also_requires_tag_warning'][0], ' ' ); + } + if ( ! empty( $tag_spec['requires_extension'] ) ) { + $this->script_components = array_merge( $this->script_components, $tag_spec['requires_extension'] ); + } + } } /** diff --git a/tests/test-class-amp-dom-utils.php b/tests/test-class-amp-dom-utils.php index a76bca4e25c..78501b0858c 100644 --- a/tests/test-class-amp-dom-utils.php +++ b/tests/test-class-amp-dom-utils.php @@ -97,7 +97,6 @@ public function test__get_content_from_dom__br_no_closing_tag() { * @covers \AMP_DOM_Utils::convert_amp_bind_attributes() * @covers \AMP_DOM_Utils::restore_amp_bind_attributes() * @covers \AMP_DOM_Utils::get_amp_bind_placeholder_attribute_prefix() - * \AMP_DOM_Utils::restore_amp_bind_attributes() */ public function test_amp_bind_conversion() { $original = ''; From 1e1c023b9a83676a4528e9c72366bc0a3bcbb186 Mon Sep 17 00:00:00 2001 From: Weston Ruter Date: Wed, 24 Jan 2018 17:15:18 -0800 Subject: [PATCH 3/7] Add tests for scripts discovered during whitelist sanitization, inc. amp-bind --- tests/test-tag-and-attribute-sanitizer.php | 97 +++++++++++++++++++++- 1 file changed, 95 insertions(+), 2 deletions(-) diff --git a/tests/test-tag-and-attribute-sanitizer.php b/tests/test-tag-and-attribute-sanitizer.php index 649b3651783..c18a5b3d3a3 100644 --- a/tests/test-tag-and-attribute-sanitizer.php +++ b/tests/test-tag-and-attribute-sanitizer.php @@ -19,91 +19,128 @@ public function get_data() { 'a4a' => array( '
', + null, // No change. + array( 'amp-ad' ), ), 'ads' => array( '
', + null, // No change. + array( 'amp-ad' ), ), 'adsense' => array( '
', + null, // No change. + array( 'amp-ad' ), ), 'amp-3q-player' => array( '', + null, + array( 'amp-3q-player' ), ), 'amp-ad' => array( '', + null, // No change. + array( 'amp-ad' ), ), 'amp-ad-exit' => array( '', + null, // No change. + array( 'amp-ad-exit' ), ), 'amp-animation' => array( '', + null, // No change. + array( 'amp-animation' ), ), 'amp-call-tracking' => array( '+1 (23) 456-789', + null, + array( 'amp-call-tracking' ), ), 'amp-call-tracking_blacklisted_config' => array( '+1 (23) 456-789', '', + array(), // Important: This needs to be empty because the amp-call-tracking is stripped. ), 'amp-embed' => array( '', + null, // No change. + array( 'amp-ad' ), ), 'amp-facebook-comments' => array( '', + null, // No change. + array( 'amp-facebook-comments' ), ), 'amp-facebook-comments_missing_required_attribute' => array( '', '', + array(), // Empty because invalid. ), 'amp-facebook-like' => array( '', + null, // No change. + array( 'amp-facebook-like' ), ), 'amp-facebook-like_missing_required_attribute' => array( '', '', + array(), // Empty because invalid. ), 'amp-fit-text' => array( 'Lorem ipsum', + null, // No change. + array( 'amp-fit-text' ), ), 'amp-gist' => array( '', + null, // No change. + array( 'amp-gist' ), ), 'amp-gist_missing_mandatory_attribute' => array( '', '', + array(), ), 'amp-gwd-animation' => array( '', + null, // No change. + array( 'amp-gwd-animation' ), ), 'amp-iframe' => array( '', + null, // No change. + array( 'amp-iframe' ), ), 'amp-iframe_incorrect_protocol' => array( '', '', + array( 'amp-iframe' ), ), 'amp-ima-video' => array( '', + null, // No change. + array( 'amp-ima-video' ), ), 'amp-ima-video_missing_required_attribute' => array( @@ -113,52 +150,74 @@ public function get_data() { 'amp-imgur' => array( '', + null, // No change. + array( 'amp-imgur' ), ), 'amp-install-serviceworker' => array( '', + null, // No change. + array( 'amp-install-serviceworker' ), ), 'amp-izlesene' => array( '', + null, // No change. + array( 'amp-izlesene' ), ), 'amp-nexxtv-player' => array( '', + null, // No change. + array( 'amp-nexxtv-player' ), ), 'amp-playbuzz' => array( '', + null, // No change. + array( 'amp-playbuzz' ), ), 'amp-playbuzz_no_src' => array( '', + null, // @todo This actually should be stripped because . + array( 'amp-playbuzz' ), ), 'amp-position-observer' => array( '', + null, // No change. + array( 'amp-position-observer' ), ), 'amp-twitter' => array( '', + null, // No change. + array( 'amp-twitter' ), ), 'amp-user-notification' => array( 'This site uses cookies to personalize content.I accept', 'This site uses cookies to personalize content.I accept', + array( 'amp-user-notification' ), ), 'amp-video' => array( '', + null, // No change. + array( 'amp-video' ), ), 'amp-vk' => array( '', + null, // No change. + array( 'amp-vk' ), ), 'amp-apester-media' => array( '', '', + array( 'amp-apester-media' ), ), 'button' => array( @@ -169,31 +228,37 @@ public function get_data() { 'brid-player' => array( '', '', + array( 'amp-brid-player' ), ), 'brightcove' => array( '', '', + array( 'amp-brightcove' ), ), 'carousel' => array( '
hello world
hello world
', '
hello world
hello world
', + array( 'amp-anim', 'amp-audio', 'amp-brightcove', 'amp-carousel', 'amp-dailymotion', 'amp-soundcloud', 'amp-video', 'amp-vimeo', 'amp-vine', 'amp-youtube' ), ), 'amp-dailymotion' => array( '

Default (responsive)

Custom

', '

Default (responsive)

Custom

', + array( 'amp-dailymotion' ), ), 'doubleclick-1' => array( '', '', + array( 'amp-ad' ), ), 'facebook' => array( '

More Posts

', '

More Posts

', + array( 'amp-facebook' ), ), 'font' => array( @@ -204,11 +269,13 @@ public function get_data() { 'form' => array( '
', '
', + array( 'amp-form' ), ), 'gfycat' => array( '', '', + array( 'amp-gfycat' ), ), 'h2' => array( @@ -251,16 +318,21 @@ public function get_data() { 'attribute_value_valid' => array( '', + null, + array( 'amp-mustache' ), ), 'attribute_value_invalid' => array( // type is mandatory, so the node is removed. '', '', + array(), // No scripts because removed. ), 'attribute_amp_accordion_value' => array( 'test', + null, // No change. + array( 'amp-accordion' ), ), 'attribute_value_with_blacklisted_regex_removed' => array( @@ -301,26 +373,33 @@ public function get_data() { 'attribute_value_with_value_regex_casei_lower' => array( '', + null, // No change. + array( 'amp-dailymotion' ), ), 'attribute_value_with_value_regex_casei_upper' => array( '', + null, // No change. + array( 'amp-dailymotion' ), ), 'attribute_value_with_bad_value_regex_casei_removed' => array( // data-ui-logo should be true|false. '', '', + array( 'amp-dailymotion' ), ), 'attribute_bad_attr_with_no_value_removed' => array( 'something here', 'something here', + array( 'amp-ad' ), ), 'attribute_bad_attr_with_value_removed' => array( 'something here', 'something here', + array( 'amp-ad' ), ), 'remove_node_with_invalid_mandatory_attribute' => array( @@ -346,6 +425,8 @@ public function get_data() { 'allow_node_with_valid_mandatory_attribute' => array( '', + null, // No change. + array( 'amp-analytics' ), ), 'nodes_with_non_whitelisted_tags_replaced_by_children' => array( @@ -371,11 +452,13 @@ public function get_data() { 'leave_attribute_on_node_with_present_mandatory_parent' => array( '
This is a test.
', '
This is a test.
', + array( 'amp-form' ), ), 'disallowed_empty_attr_removed' => array( '', '', + array( 'amp-user-notification' ), ), 'allowed_empty_attr' => array( @@ -385,6 +468,7 @@ public function get_data() { 'remove_node_with_disallowed_ancestor' => array( 'The sidebarThis node is not allowed here.', 'The sidebar', + array( 'amp-sidebar' ), ), 'remove_node_without_mandatory_ancestor' => array( @@ -523,6 +607,13 @@ public function get_data() { 'amp_bind_attr' => array( '

Hello World

', + null, // No change. + array( 'amp-bind' ), + ), + + 'amp_bad_bind_attr' => array( + '

Hello World

', + '

Hello World

', ), ); } @@ -563,10 +654,11 @@ public function test_is_missing_mandatory_attribute() { * * @dataProvider get_data * @group allowed-tags - * @param string $source Markup to process. + * @param string $source Markup to process. * @param string $expected The markup to expect. + * @param array $scripts The AMP component script names that are obtained through sanitization. */ - public function test_sanitizer( $source, $expected = null ) { + public function test_sanitizer( $source, $expected = null, $scripts = array() ) { $expected = isset( $expected ) ? $expected : $source; $dom = AMP_DOM_Utils::get_dom_from_content( $source ); $sanitizer = new AMP_Tag_And_Attribute_Sanitizer( $dom ); @@ -574,5 +666,6 @@ public function test_sanitizer( $source, $expected = null ) { $content = AMP_DOM_Utils::get_content_from_dom( $dom ); $content = preg_replace( '/(?<=>)\s+(?=<)/', '', $content ); $this->assertEquals( $expected, $content ); + $this->assertEqualSets( $scripts, array_keys( $sanitizer->get_scripts() ) ); } } From 567f1d5e821f376aae8cf5cf308f2d70124c6a59 Mon Sep 17 00:00:00 2001 From: Weston Ruter Date: Wed, 24 Jan 2018 22:28:05 -0800 Subject: [PATCH 4/7] Validate bound attributes by putting placeholders in spec --- .../class-amp-tag-and-attribute-sanitizer.php | 47 +++++++++++++++---- includes/utils/class-amp-dom-utils.php | 10 ++-- tests/test-class-amp-dom-utils.php | 6 +-- tests/test-tag-and-attribute-sanitizer.php | 4 +- 4 files changed, 47 insertions(+), 20 deletions(-) diff --git a/includes/sanitizers/class-amp-tag-and-attribute-sanitizer.php b/includes/sanitizers/class-amp-tag-and-attribute-sanitizer.php index 2f439b0fb66..021011873fa 100644 --- a/includes/sanitizers/class-amp-tag-and-attribute-sanitizer.php +++ b/includes/sanitizers/class-amp-tag-and-attribute-sanitizer.php @@ -91,16 +91,27 @@ public function __construct( $dom, $args = array() ) { 'amp_allowed_tags' => AMP_Allowed_Tags_Generated::get_allowed_tags(), 'amp_globally_allowed_attributes' => AMP_Allowed_Tags_Generated::get_allowed_attributes(), 'amp_layout_allowed_attributes' => AMP_Allowed_Tags_Generated::get_layout_attributes(), + 'amp_bind_placeholder_prefix' => AMP_DOM_Utils::get_amp_bind_placeholder_prefix(), ); parent::__construct( $dom, $args ); - /** - * Prepare whitelists - */ - $this->allowed_tags = $this->args['amp_allowed_tags']; - $this->globally_allowed_attributes = $this->args['amp_globally_allowed_attributes']; - $this->layout_allowed_attributes = $this->args['amp_layout_allowed_attributes']; + // Prepare whitelists. + $this->allowed_tags = $this->args['amp_allowed_tags']; + foreach ( AMP_Rule_Spec::$additional_allowed_tags as $tag_name => $tag_rule_spec ) { + $this->allowed_tags[ $tag_name ][] = $tag_rule_spec; + } + + // @todo Add the placeholder bind attribute to the alternative names instead. + foreach ( $this->allowed_tags as &$tag_specs ) { + foreach ( $tag_specs as &$tag_spec ) { + if ( isset( $tag_spec[ AMP_Rule_Spec::ATTR_SPEC_LIST ] ) ) { + $tag_spec[ AMP_Rule_Spec::ATTR_SPEC_LIST ] = $this->convert_bind_attributes( $tag_spec[ AMP_Rule_Spec::ATTR_SPEC_LIST ] ); + } + } + } + $this->globally_allowed_attributes = $this->convert_bind_attributes( $this->args['amp_globally_allowed_attributes'] ); + $this->layout_allowed_attributes = $this->convert_bind_attributes( $this->args['amp_layout_allowed_attributes'] ); } /** @@ -127,6 +138,26 @@ public function get_scripts() { return $scripts; } + /** + * Modify attribute spec list to replace AMP binding attributes with the placeholders. + * + * @since 0.7 + * + * @param array $attr_spec_list Attribute spec list. + * @return array Modified attribute spec list. + */ + private function convert_bind_attributes( $attr_spec_list ) { + foreach ( array_keys( $attr_spec_list ) as $attr_name ) { + if ( '[' === $attr_name[0] ) { + $actual_attr_name = $this->args['amp_bind_placeholder_prefix'] . trim( $attr_name, '[]' ); + + $attr_spec_list[ $actual_attr_name ] = $attr_spec_list[ $attr_name ]; + unset( $attr_spec_list[ $attr_name ] ); + } + } + return $attr_spec_list; + } + /** * Sanitize the