From e2a714666d18673b4760b5f2a091b9997731626c Mon Sep 17 00:00:00 2001
From: Dennis Snell
Date: Wed, 2 Aug 2023 15:55:20 -0700
Subject: [PATCH] HTML API: Add functions to read inner and outer HTML.
---
.../html-api/class-wp-html-processor.php | 123 +++++++++++++++++-
.../wpHtmlProcessorGetInnerMarkup.php | 93 +++++++++++++
.../wpHtmlProcessorGetOuterMarkup.php | 95 ++++++++++++++
3 files changed, 305 insertions(+), 6 deletions(-)
create mode 100644 tests/phpunit/tests/html-api/wpHtmlProcessorGetInnerMarkup.php
create mode 100644 tests/phpunit/tests/html-api/wpHtmlProcessorGetOuterMarkup.php
diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php
index 4dd7d0e492d9e..f652900e2455d 100644
--- a/src/wp-includes/html-api/class-wp-html-processor.php
+++ b/src/wp-includes/html-api/class-wp-html-processor.php
@@ -417,6 +417,84 @@ public function next_tag( $query = null ) {
return false;
}
+	/**
+	 * Returns the raw HTML content inside a matched tag.
+	 *
+	 * The returned string is sliced directly from the input document. It starts
+	 * right after the matched opening tag and ends right before the token at
+	 * which the matched element is no longer on the stack of open elements, or
+	 * at the end of the document if no such token is found.
+	 *
+	 * Calling this method advances the processor: it steps through the document
+	 * to find where the element ends and does not seek back afterwards.
+	 *
+	 * @since 6.4.0
+	 *
+	 * @throws Exception When unable to allocate a bookmark for internal tracking of the open tag.
+	 *
+	 * @return string|null The inner HTML if available, else NULL.
+	 */
+	public function get_inner_markup() {
+		// Only an opening tag has inner markup; a tag closer has nothing inside it.
+		if ( null === $this->get_tag() || $this->is_tag_closer() ) {
+			return null;
+		}
+
+		$start = $this->current_token;
+
+		// Without the start bookmark there is no offset from which to slice.
+		if ( ! parent::set_bookmark( 'start' ) ) {
+			return null;
+		}
+
+		// NOTE(review): confirm how step() treats void elements; they should yield no inner markup.
+		// @TODO: add after-pop hook to turn this into a constant boolean check.
+		do {
+			$found_tag = $this->step();
+		} while ( $found_tag && $this->state->stack_of_open_elements->contains_node( $start ) );
+
+		if ( ! $found_tag ) {
+			/*
+			 * If there's no tag to bookmark then it means the opened tag has no closing
+			 * and the rest of the document is contained within the inner HTML.
+			 */
+			$inner_html = $this->substr_bookmark( 'after', 'start' );
+		} elseif ( parent::set_bookmark( 'end' ) ) {
+			$inner_html = $this->substr_bookmarks( 'after', 'start', 'before', 'end' );
+			parent::release_bookmark( 'end' );
+		} else {
+			// The end of the element could not be bookmarked, so it cannot be sliced.
+			$inner_html = null;
+		}
+
+		parent::release_bookmark( 'start' );
+
+		return $inner_html;
+	}
+
+	/**
+	 * Returns the raw HTML content of a matched tag, including the tag itself.
+	 *
+	 * The returned string is sliced directly from the input document. It starts
+	 * at the matched opening tag. If the token which removes the element from
+	 * the stack of open elements is a closing tag of the same name, that closing
+	 * tag is included; otherwise the slice ends right before that token. If no
+	 * such token is found, the slice runs to the end of the document.
+	 *
+	 * Calling this method advances the processor: it steps through the document
+	 * to find where the element ends and does not seek back afterwards.
+	 *
+	 * @since 6.4.0
+	 *
+	 * @throws Exception When unable to allocate a bookmark for internal tracking of the open tag.
+	 *
+	 * @return string|null The outer HTML if available, else NULL.
+	 */
+	public function get_outer_markup() {
+		// Only an opening tag starts an element whose outer markup can be read.
+		if ( null === $this->get_tag() || $this->is_tag_closer() ) {
+			return null;
+		}
+
+		$start = $this->current_token;
+
+		// Without the start bookmark there is no offset from which to slice.
+		if ( ! parent::set_bookmark( 'start' ) ) {
+			return null;
+		}
+
+		// NOTE(review): confirm how step() treats void elements; their outer markup should be the tag alone.
+		// @TODO: add after-pop hook to turn this into a constant boolean check.
+		do {
+			$found_tag = $this->step();
+		} while ( $found_tag && $this->state->stack_of_open_elements->contains_node( $start ) );
+
+		if ( ! $found_tag ) {
+			/*
+			 * If there's no tag to bookmark then it means the opened tag has no closing
+			 * and the rest of the document is contained within the outer HTML.
+			 */
+			$outer_html = $this->substr_bookmark( 'before', 'start' );
+		} elseif ( parent::set_bookmark( 'end' ) ) {
+			/*
+			 * The element's own closing tag belongs to the outer HTML, but any
+			 * other token which removed the element from the stack does not.
+			 */
+			$did_close    = $this->get_tag() === $start->node_name && $this->is_tag_closer();
+			$end_position = $did_close ? 'after' : 'before';
+			$outer_html   = $this->substr_bookmarks( 'before', 'start', $end_position, 'end' );
+			parent::release_bookmark( 'end' );
+		} else {
+			// The end of the element could not be bookmarked, so it cannot be sliced.
+			$outer_html = null;
+		}
+
+		parent::release_bookmark( 'start' );
+
+		return $outer_html;
+	}
+
/**
* Steps through the HTML document and stop at the next tag, if any.
*
@@ -437,12 +515,9 @@ public function step( $node_to_process = self::PROCESS_NEXT_NODE ) {
$this->state->stack_of_open_elements->pop();
}
- parent::next_tag( self::VISIT_EVERYTHING );
- }
-
- // Finish stepping when there are no more tokens in the document.
- if ( null === $this->get_tag() ) {
- return false;
+ if ( ! parent::next_tag( self::VISIT_EVERYTHING ) ) {
+ return false;
+ }
}
$this->current_token = new WP_HTML_Token(
@@ -722,6 +797,42 @@ private function bookmark_tag() {
return "{$this->bookmark_counter}";
}
+	/**
+	 * Extracts the tail of the input HTML document, beginning at a bookmark.
+	 *
+	 * @since 6.4.0
+	 *
+	 * @param string $start_position "before" to begin at the start of the bookmark; any
+	 *                               other value (normally "after") begins just past its end.
+	 * @param string $start          Name of the bookmark at which to begin clipping.
+	 * @return string Portion of the input HTML document from the chosen offset to its end.
+	 */
+	private function substr_bookmark( $start_position, $start ) {
+		$bookmark = $this->bookmarks[ $start ];
+
+		if ( 'before' === $start_position ) {
+			$offset = $bookmark->start;
+		} else {
+			// The bookmark's end is treated as inclusive, so move one byte past it.
+			$offset = $bookmark->end + 1;
+		}
+
+		return substr( $this->html, $offset );
+	}
+
+	/**
+	 * Extracts the part of the input HTML document which lies between two bookmarks.
+	 *
+	 * For each bookmark, "before" selects the bookmark's starting offset while any
+	 * other value (normally "after") selects the offset just past the bookmark's end.
+	 *
+	 * @since 6.4.0
+	 *
+	 * @param string $start_position Which side of the starting bookmark begins the clip.
+	 * @param string $start          Name of the bookmark at which to begin clipping.
+	 * @param string $end_position   Which side of the ending bookmark finishes the clip.
+	 * @param string $end            Name of the bookmark at which to stop clipping.
+	 * @return string Portion of the input HTML document between the two chosen offsets.
+	 */
+	private function substr_bookmarks( $start_position, $start, $end_position, $end ) {
+		$opening = $this->bookmarks[ $start ];
+		$closing = $this->bookmarks[ $end ];
+
+		if ( 'before' === $start_position ) {
+			$from = $opening->start;
+		} else {
+			$from = $opening->end + 1;
+		}
+
+		if ( 'before' === $end_position ) {
+			$until = $closing->start;
+		} else {
+			$until = $closing->end + 1;
+		}
+
+		$length = $until - $from;
+
+		return substr( $this->html, $from, $length );
+	}
+
/*
* HTML semantic overrides for Tag Processor
*/
diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessorGetInnerMarkup.php b/tests/phpunit/tests/html-api/wpHtmlProcessorGetInnerMarkup.php
new file mode 100644
index 0000000000000..d16aac6f9cc91
--- /dev/null
+++ b/tests/phpunit/tests/html-api/wpHtmlProcessorGetInnerMarkup.php
@@ -0,0 +1,93 @@
+
' );
+
+ $this->assertNull( $p->get_inner_markup() );
+
+ $this->assertFalse( $p->next_tag( 'BUTTON' ), "Should not have found a BUTTON tag but stopped at {$p->get_tag()}." );
+ $this->assertNull( $p->get_inner_markup() );
+ }
+
+	/**
+	 * Ensures the inner markup reported for the targeted node matches the expected string.
+	 *
+	 * @ticket {TICKET_NUMBER}
+	 *
+	 * @covers WP_HTML_Processor::get_inner_markup
+	 *
+	 * @dataProvider data_html_with_inner_markup
+	 *
+	 * @since 6.4.0
+	 *
+	 * @param string $html_with_target_node HTML containing a node with the `target` attribute set.
+	 * @param string $expected_inner_markup Inner markup of target node.
+	 */
+	public function test_returns_appropriate_inner_markup( $html_with_target_node, $expected_inner_markup ) {
+		$processor = WP_HTML_Processor::createFragment( $html_with_target_node );
+
+		// Advance until a tag carrying the `target` attribute is matched or no tags remain.
+		do {
+			$matched_tag = $processor->next_tag();
+		} while ( $matched_tag && null === $processor->get_attribute( 'target' ) );
+
+		$actual_inner_markup = $processor->get_inner_markup();
+
+		$this->assertSame( $expected_inner_markup, $actual_inner_markup, 'Failed to return appropriate inner markup.' );
+	}
+
+ /**
+ * Data provider.
+ *
+ * @return array[]
+ */
+ public function data_html_with_inner_markup() {
+ $data = array(
+ 'Empty elements' => array( '', '' ),
+ 'Element containing only text' => array( 'inside
', 'inside' ),
+ 'Element with nested tags' => array( 'inside the div
', 'inside the div' ),
+ 'Unclosed element' => array( 'This is
all inside the DIV', 'This is
all inside the DIV' ),
+ 'Partially-closed element' => array( '
This is all inside the DIV
all inside the DIV
array( 'Outside the P
', 'Inside the P' ),
+ );
+
+ $inner_html = <<This is inside the Match
+
+
+
+
+HTML;
+
+ $html = <<
+ This is not in the match.
+
This is another paragraph not in the match.
+
+
{$inner_html}
+
+
This is also note in the match.
+
+HTML;
+ $data['Complicated inner nesting'] = array( $html, $inner_html );
+
+ return $data;
+ }
+}
diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessorGetOuterMarkup.php b/tests/phpunit/tests/html-api/wpHtmlProcessorGetOuterMarkup.php
new file mode 100644
index 0000000000000..8536cf4ea1ea7
--- /dev/null
+++ b/tests/phpunit/tests/html-api/wpHtmlProcessorGetOuterMarkup.php
@@ -0,0 +1,95 @@
+
' );
+
+ $this->assertNull( $p->get_outer_markup() );
+
+ $this->assertFalse( $p->next_tag( 'BUTTON' ), "Should not have found a BUTTON tag but stopped at {$p->get_tag()}." );
+ $this->assertNull( $p->get_outer_markup() );
+ }
+
+	/**
+	 * Ensures the outer markup reported for the targeted node matches the expected string.
+	 *
+	 * @ticket {TICKET_NUMBER}
+	 *
+	 * @covers WP_HTML_Processor::get_outer_markup
+	 *
+	 * @dataProvider data_html_with_outer_markup
+	 *
+	 * @since 6.4.0
+	 *
+	 * @param string $html_with_target_node HTML containing a node with the `target` attribute set.
+	 * @param string $expected_outer_markup Outer markup of target node.
+	 */
+	public function test_returns_appropriate_outer_markup( $html_with_target_node, $expected_outer_markup ) {
+		$p = WP_HTML_Processor::createFragment( $html_with_target_node );
+
+		// Advance until a tag carrying the `target` attribute is matched or no tags remain.
+		while ( $p->next_tag() && null === $p->get_attribute( 'target' ) ) {
+			continue;
+		}
+
+		// The failure message names the method under test so a failing run points at the right API.
+		$this->assertSame( $expected_outer_markup, $p->get_outer_markup(), 'Failed to return appropriate outer markup.' );
+	}
+
+ /**
+ * Data provider.
+ *
+ * @return array[]
+ */
+ public function data_html_with_outer_markup() {
+ $data = array(
+ 'Empty elements' => array( '', '' ),
+ 'Element containing only text' => array( 'inside
', 'inside
' ),
+ 'Element with nested tags' => array( 'inside the div
', 'inside the div
' ),
+ 'Unclosed element' => array( 'This is
all inside the DIV', '
This is
all inside the DIV' ),
+ 'Partially-closed element' => array( '
This is all inside the DIV
This is
all inside the DIV
array( '
Outside the P', '
Inside the P' ),
+ );
+
+ $inner_html = <<This is inside the Match
+
+
+
+
+HTML;
+
+ $html = <<
+
This is not in the match.
+
This is another paragraph not in the match.
+
+ {$inner_html}
+
+
This is also note in the match.
+
+HTML;
+ $data['Complicated inner nesting'] = array( $html, "{$inner_html}
" );
+
+ return $data;
+ }
+}