Skip to content

Commit

Permalink
Deduplicate head or body nodes
Browse files Browse the repository at this point in the history
  • Loading branch information
schlessera committed Jan 15, 2020
1 parent 51838be commit 22727d1
Show file tree
Hide file tree
Showing 2 changed files with 80 additions and 0 deletions.
60 changes: 60 additions & 0 deletions src/Dom/Document.php
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

namespace Amp\AmpWP\Dom;

use DOMAttr;
use DOMComment;
use DOMDocument;
use DOMElement;
Expand Down Expand Up @@ -355,6 +356,8 @@ public function loadHTML( $source, $options = 0 ) {
$this->head->insertBefore( $charset, $this->head->firstChild );

// Do some further clean-up.
$this->deduplicate_tag( self::TAG_HEAD );
$this->deduplicate_tag( self::TAG_BODY );
$this->move_invalid_head_nodes_to_body();
}

Expand Down Expand Up @@ -1018,6 +1021,63 @@ private function restore_doctype_node( $html ) {
return preg_replace( self::HTML_RESTORE_DOCTYPE_PATTERN, '\1!\3\4>', $html, 1 );
}

/**
 * Deduplicate a given tag.
 *
 * The first occurrence of the tag is kept as the main tag. All child nodes and attribute nodes of every
 * subsequent occurrence are moved over to the main tag, after which the emptied duplicate is removed from
 * the document.
 *
 * If an attribute is present on both the main tag and a duplicate, the attribute node of the duplicate
 * replaces the one on the main tag.
 *
 * For the head and body tags, the matching property on this document is updated to point to the main tag.
 *
 * @param string $tag_name Name of the tag to deduplicate.
 */
public function deduplicate_tag( $tag_name ) {
	$tags = $this->getElementsByTagName( $tag_name );

	/**
	 * Main tag to keep.
	 *
	 * @var DOMElement|null $main_tag
	 */
	$main_tag = $tags->item( 0 );

	if ( null === $main_tag ) {
		return;
	}

	// The node list is live: removing a duplicate shrinks it, so index 1 always holds the next duplicate.
	while ( $tags->length > 1 ) {
		/**
		 * Tag to remove.
		 *
		 * @var DOMElement $tag_to_remove
		 */
		$tag_to_remove = $tags->item( 1 );

		// Drain the children through firstChild rather than iterating over childNodes. The childNodes list is
		// live, so moving nodes out of it while iterating would skip children, and the skipped ones would then
		// be discarded together with the duplicate tag. appendChild() moves a node that is already attached.
		while ( null !== $tag_to_remove->firstChild ) {
			$main_tag->appendChild( $tag_to_remove->firstChild );
		}

		while ( $tag_to_remove->hasAttributes() ) {
			/**
			 * Attribute node to move over to the main tag.
			 *
			 * @var DOMAttr $attribute
			 */
			$attribute = $tag_to_remove->attributes->item( 0 );

			// Detach the attribute from the duplicate before attaching it to the main tag.
			$tag_to_remove->removeAttributeNode( $attribute );

			// @TODO Attributes present on both tags are not merged; the value from the duplicate wins.
			// We could move over the copy_attributes from AMP_DOM_Utils to do this.
			$main_tag->setAttributeNode( $attribute );
		}

		$tag_to_remove->parentNode->removeChild( $tag_to_remove );
	}

	// Store the main tag in the matching property to avoid doing the above query again if possible.
	if ( in_array( $tag_name, [ self::TAG_HEAD, self::TAG_BODY ], true ) ) {
		$this->$tag_name = $main_tag;
	}
}

/**
* Determine whether a node can be in the head.
*
Expand Down
20 changes: 20 additions & 0 deletions tests/php/test-class-amp-dom-document.php
Original file line number Diff line number Diff line change
Expand Up @@ -33,16 +33,36 @@ public function data_dom_document() {
'<!DOCTYPE html><html amp lang="en">' . $head . '<body class="some-class"><p>Text</p></body></html>',
'<!DOCTYPE html><html amp lang="en">' . $head . '<body class="some-class"><p>Text</p></body></html>',
],
'html_attributes' => [
'utf-8',
'<!DOCTYPE html><html lang="en-US" class="no-js">' . $head . '<body></body></html>',
'<!DOCTYPE html><html lang="en-US" class="no-js">' . $head . '<body></body></html>',
],
'head_attributes' => [
'utf-8',
'<!DOCTYPE html><html><head itemscope itemtype="http://schema.org/WebSite"></head><body></body></html>',
'<!DOCTYPE html><html><head itemscope itemtype="http://schema.org/WebSite"><meta charset="utf-8"></head><body></body></html>',
],
'missing_head' => [
'utf-8',
'<!DOCTYPE html><html amp lang="en"><body class="some-class"><p>Text</p></body></html>',
'<!DOCTYPE html><html amp lang="en">' . $head . '<body class="some-class"><p>Text</p></body></html>',
],
'multiple_heads' => [
'utf-8',
'<!DOCTYPE html><html amp lang="en"><head itemscope itemtype="http://schema.org/WebSite"><meta name="first" content="something"></head><head data-something="else"><meta name="second" content="something-else"></head><body class="some-class"><p>Text</p></body></html>',
'<!DOCTYPE html><html amp lang="en"><head itemscope itemtype="http://schema.org/WebSite" data-something="else"><meta charset="utf-8"><meta name="first" content="something"><meta name="second" content="something-else"></head><body class="some-class"><p>Text</p></body></html>',
],
'missing_body' => [
'utf-8',
'<!DOCTYPE html><html amp lang="en">' . $head . '<p>Text</p></html>',
'<!DOCTYPE html><html amp lang="en">' . $head . '<body><p>Text</p></body></html>',
],
'multiple_bodies' => [
'utf-8',
'<!DOCTYPE html><html amp lang="en">' . $head . '<body class="no-js"><p>Text</p></body><body data-some-attribute="to keep"><p>Yet another Text</p></body></html>',
'<!DOCTYPE html><html amp lang="en">' . $head . '<body class="no-js" data-some-attribute="to keep"><p>Text</p><p>Yet another Text</p></body></html>',
],
'missing_head_and_body' => [
'utf-8',
'<!DOCTYPE html><html amp lang="en"><p>Text</p></html>',
Expand Down

0 comments on commit 22727d1

Please sign in to comment.