From e2a714666d18673b4760b5f2a091b9997731626c Mon Sep 17 00:00:00 2001
From: Dennis Snell
Date: Wed, 2 Aug 2023 15:55:20 -0700
Subject: [PATCH] HTML API: Add functions to read inner and outer HTML.

---
 .../html-api/class-wp-html-processor.php      | 123 +++++++++++++++++-
 .../wpHtmlProcessorGetInnerMarkup.php         |  93 +++++++++++++
 .../wpHtmlProcessorGetOuterMarkup.php         |  95 ++++++++++++++
 3 files changed, 305 insertions(+), 6 deletions(-)
 create mode 100644 tests/phpunit/tests/html-api/wpHtmlProcessorGetInnerMarkup.php
 create mode 100644 tests/phpunit/tests/html-api/wpHtmlProcessorGetOuterMarkup.php

diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php
index 4dd7d0e492d9e..f652900e2455d 100644
--- a/src/wp-includes/html-api/class-wp-html-processor.php
+++ b/src/wp-includes/html-api/class-wp-html-processor.php
@@ -417,6 +417,84 @@ public function next_tag( $query = null ) {
 		return false;
 	}
 
+	/**
+	 * Returns the raw HTML inside a matched tag, stepping forward until its element closes or the document ends.
+	 *
+	 * @since 6.4.0
+	 *
+	 * @throws Exception When unable to allocate a bookmark for internal tracking of the open tag.
+	 *
+	 * @return string|null The inner HTML if available, else NULL.
+	 */
+	public function get_inner_markup() {
+		if ( null === $this->get_tag() ) {
+			return null;
+		}
+
+		$start = $this->current_token;
+		parent::set_bookmark( 'start' );
+		// @TODO: add after-pop hook to turn this into a constant boolean check.
+		do {
+			$found_tag = $this->step();
+		} while ( $found_tag && $this->state->stack_of_open_elements->contains_node( $start ) );
+
+		/*
+		 * If there's no tag to bookmark then it means the opened tag has no closing
+		 * and the rest of the document is contained within the inner HTML.
+		 */
+		if ( ! $found_tag ) {
+			$inner_html = $this->substr_bookmark( 'after', 'start' );
+		} else {
+			parent::set_bookmark( 'end' );
+			$inner_html = $this->substr_bookmarks( 'after', 'start', 'before', 'end' );
+			parent::release_bookmark( 'end' );
+		}
+
+		parent::release_bookmark( 'start' );
+
+		return $inner_html;
+	}
+
+	/**
+	 * Returns the raw HTML of a matched tag, tag included, stepping forward until its element closes or the document ends.
+	 *
+	 * @since 6.4.0
+	 *
+	 * @throws Exception When unable to allocate a bookmark for internal tracking of the open tag.
+	 *
+	 * @return string|null The outer HTML if available, else NULL.
+	 */
+	public function get_outer_markup() {
+		if ( null === $this->get_tag() ) {
+			return null;
+		}
+
+		$start = $this->current_token;
+		parent::set_bookmark( 'start' );
+		// @TODO: add after-pop hook to turn this into a constant boolean check.
+		do {
+			$found_tag = $this->step();
+		} while ( $found_tag && $this->state->stack_of_open_elements->contains_node( $start ) );
+
+		/*
+		 * If there's no tag to bookmark then it means the opened tag has no closing
+		 * and the rest of the document is contained within the outer HTML.
+		 */
+		if ( ! $found_tag ) {
+			$outer_html = $this->substr_bookmark( 'before', 'start' );
+		} else {
+			parent::set_bookmark( 'end' );
+			$did_close    = $this->get_tag() === $start->node_name && $this->is_tag_closer();
+			$end_position = $did_close ? 'after' : 'before';
+			$outer_html   = $this->substr_bookmarks( 'before', 'start', $end_position, 'end' );
+			parent::release_bookmark( 'end' );
+		}
+
+		parent::release_bookmark( 'start' );
+
+		return $outer_html;
+	}
+
 	/**
 	 * Steps through the HTML document and stop at the next tag, if any.
 	 *
@@ -437,12 +515,9 @@ public function step( $node_to_process = self::PROCESS_NEXT_NODE ) {
 				$this->state->stack_of_open_elements->pop();
 			}
 
-			parent::next_tag( self::VISIT_EVERYTHING );
-		}
-
-		// Finish stepping when there are no more tokens in the document.
-		if ( null === $this->get_tag() ) {
-			return false;
+			if ( ! parent::next_tag( self::VISIT_EVERYTHING ) ) {
+				return false;
+			}
 		}
 
 		$this->current_token = new WP_HTML_Token(
@@ -722,6 +797,42 @@ private function bookmark_tag() {
 		return "{$this->bookmark_counter}";
 	}
 
+	/**
+	 * Returns a substring of the input HTML document from a bookmark until the end.
+	 *
+	 * @since 6.4.0
+	 *
+	 * @param string $start_position "before" to clip before bookmark, "after" to clip after.
+	 * @param string $start          Bookmark name at which to start clipping.
+	 * @return string Clipped substring of input HTML document.
+	 */
+	private function substr_bookmark( $start_position, $start ) {
+		$start_bookmark = $this->bookmarks[ $start ];
+		$start_offset   = 'before' === $start_position ? $start_bookmark->start : $start_bookmark->end + 1;
+
+		return substr( $this->html, $start_offset );
+	}
+
+	/**
+	 * Returns a substring of the input HTML document delimited by bookmarks.
+	 *
+	 * @since 6.4.0
+	 *
+	 * @param string $start_position "before" to clip before bookmark, "after" to clip after.
+	 * @param string $start          Bookmark name at which to start clipping.
+	 * @param string $end_position   "before" to clip before bookmark, "after" to clip after.
+	 * @param string $end            Bookmark name at which to end clipping.
+	 * @return string Clipped substring of input HTML document.
+	 */
+	private function substr_bookmarks( $start_position, $start, $end_position, $end ) {
+		$start_bookmark = $this->bookmarks[ $start ];
+		$end_bookmark   = $this->bookmarks[ $end ];
+		$start_offset   = 'before' === $start_position ? $start_bookmark->start : $start_bookmark->end + 1;
+		$end_offset     = 'before' === $end_position ? $end_bookmark->start : $end_bookmark->end + 1;
+
+		return substr( $this->html, $start_offset, $end_offset - $start_offset );
+	}
+
 	/*
 	 * HTML semantic overrides for Tag Processor
 	 */
diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessorGetInnerMarkup.php b/tests/phpunit/tests/html-api/wpHtmlProcessorGetInnerMarkup.php
new file mode 100644
index 0000000000000..d16aac6f9cc91
--- /dev/null
+++ b/tests/phpunit/tests/html-api/wpHtmlProcessorGetInnerMarkup.php
@@ -0,0 +1,93 @@
+
' ); + + $this->assertNull( $p->get_inner_markup() ); + + $this->assertFalse( $p->next_tag( 'BUTTON' ), "Should not have found a BUTTON tag but stopped at {$p->get_tag()}." ); + $this->assertNull( $p->get_inner_markup() ); + } + + /** + * @ticket {TICKET_NUMBER} + * + * @covers WP_HTML_Processor::get_inner_markup + * + * @dataProvider data_html_with_inner_markup + * + * @since 6.4.0 + * + * @param string $html_with_target_node HTML containing a node with the `target` attribute set. + * @param string $expected_inner_markup Inner markup of target node. + */ + public function test_returns_appropriate_inner_markup( $html_with_target_node, $expected_inner_markup ) { + $p = WP_HTML_Processor::createFragment( $html_with_target_node ); + + while ( $p->next_tag() && null === $p->get_attribute( 'target' ) ) { + continue; + } + + $this->assertSame( $expected_inner_markup, $p->get_inner_markup(), 'Failed to return appropriate inner markup.' ); + } + + /** + * Data provider. + * + * @return array[] + */ + public function data_html_with_inner_markup() { + $data = array( + 'Empty elements' => array( '
', '' ), + 'Element containing only text' => array( '
inside
', 'inside' ), + 'Element with nested tags' => array( '
inside the div
', 'inside the div' ), + 'Unclosed element' => array( '
This is all inside the DIV', 'This is all inside the DIV' ), + 'Partially-closed element' => array( '
This is all inside the DIVall inside the DIV array( '

Inside the P

Outside the P

', 'Inside the P' ), + ); + + $inner_html = <<This is inside the Match

+

+
+
+ +
Look at the picture photograph.
+
+
+HTML; + + $html = << +

This is not in the match. +

This is another paragraph not in the match. +

+
{$inner_html}
+
+

This is also note in the match.

+
+HTML; + $data['Complicated inner nesting'] = array( $html, $inner_html ); + + return $data; + } +} diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessorGetOuterMarkup.php b/tests/phpunit/tests/html-api/wpHtmlProcessorGetOuterMarkup.php new file mode 100644 index 0000000000000..8536cf4ea1ea7 --- /dev/null +++ b/tests/phpunit/tests/html-api/wpHtmlProcessorGetOuterMarkup.php @@ -0,0 +1,95 @@ +
' ); + + $this->assertNull( $p->get_outer_markup() ); + + $this->assertFalse( $p->next_tag( 'BUTTON' ), "Should not have found a BUTTON tag but stopped at {$p->get_tag()}." ); + $this->assertNull( $p->get_outer_markup() ); + } + + /** + * @ticket {TICKET_NUMBER} + * + * @covers WP_HTML_Processor::get_outer_markup + * + * @dataProvider data_html_with_outer_markup + * + * @since 6.4.0 + * + * @param string $html_with_target_node HTML containing a node with the `target` attribute set. + * @param string $expected_outer_markup Outer markup of target node. + */ + public function test_returns_appropriate_outer_markup( $html_with_target_node, $expected_outer_markup ) { + $p = WP_HTML_Processor::createFragment( $html_with_target_node ); + + while ( $p->next_tag() && null === $p->get_attribute( 'target' ) ) { + continue; + } + + $this->assertSame( $expected_outer_markup, $p->get_outer_markup(), 'Failed to return appropriate inner markup.' ); + } + + /** + * Data provider. + * + * @return array[] + */ + public function data_html_with_outer_markup() { + $data = array( + 'Empty elements' => array( '
', '
' ), + 'Element containing only text' => array( '
inside
', '
inside
' ), + 'Element with nested tags' => array( '
inside the div
', '
inside the div
' ), + 'Unclosed element' => array( '
This is all inside the DIV', '
This is all inside the DIV' ), + 'Partially-closed element' => array( '
This is all inside the DIVThis is all inside the DIV array( '

Inside the P

Outside the P

', '

Inside the P' ), + ); + + $inner_html = <<This is inside the Match

+

+
+
+ +
Look at the picture photograph.
+
+
+HTML; + + $html = << +

This is not in the match. +

This is another paragraph not in the match. +

+
{$inner_html}
+
+

This is also note in the match.

+
+HTML; + $data['Complicated inner nesting'] = array( $html, "
{$inner_html}
" ); + + return $data; + } +}