From a25b57a8f818d47477e9a5421e8b7d74fa8ac36e Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Fri, 17 Nov 2023 15:42:30 -0700 Subject: [PATCH] Add `next_token()` method --- .../html-api/class-wp-html-tag-processor.php | 491 ++++++++++-------- 1 file changed, 264 insertions(+), 227 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 4cb3092f6c0a5..14354e9040125 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -393,7 +393,7 @@ class WP_HTML_Tag_Processor { * * @var string|null */ - private $last_token_type = null; + public $last_token_type = null; /** * In what mode the parser should resume after pausing, @@ -566,86 +566,117 @@ public function next_tag( $query = null ) { $already_found = 0; do { - if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { - return false; - } - - // Find the next tag if it exists. - if ( false === $this->parse_next_tag() ) { - $this->bytes_already_parsed = strlen( $this->html ); - + if ( false === $this->next_token() ) { return false; } - // Parse all of its attributes. - while ( $this->parse_next_attribute() ) { + if ( self::ELEMENT_NODE !== $this->last_token_type ) { continue; } - // Ensure that the tag closes before the end of the document. - if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { - return false; - } - - $tag_ends_at = strpos( $this->html, '>', $this->bytes_already_parsed ); - if ( false === $tag_ends_at ) { - return false; - } - $this->tag_ends_at = $tag_ends_at; - $this->bytes_already_parsed = min( strlen( $this->html ) - 1, $tag_ends_at + 1 ); - // Finally, check if the parsed tag and its attributes match the search query.
if ( $this->matches() ) { ++$already_found; } + } while ( $already_found < $this->sought_match_offset ); - /* - * For non-DATA sections which might contain text that looks like HTML tags but - * isn't, scan with the appropriate alternative mode. Looking at the first letter - * of the tag name as a pre-check avoids a string allocation when it's not needed. - */ - $t = $this->html[ $this->tag_name_starts_at ]; - if ( - ! $this->is_closing_tag && - ( - 'i' === $t || 'I' === $t || - 'n' === $t || 'N' === $t || - 's' === $t || 'S' === $t || - 't' === $t || 'T' === $t - ) ) { - $tag_name = $this->get_tag(); - - if ( 'SCRIPT' === $tag_name && ! $this->skip_script_data() ) { - $this->bytes_already_parsed = strlen( $this->html ); - return false; - } elseif ( - ( 'TEXTAREA' === $tag_name || 'TITLE' === $tag_name ) && - ! $this->skip_rcdata( $tag_name ) - ) { - $this->bytes_already_parsed = strlen( $this->html ); - return false; - } elseif ( - ( - 'IFRAME' === $tag_name || - 'NOEMBED' === $tag_name || - 'NOFRAMES' === $tag_name || - 'NOSCRIPT' === $tag_name || - 'STYLE' === $tag_name - ) && - ! $this->skip_rawtext( $tag_name ) - ) { - /* - * "XMP" should be here too but its rules are more complicated and require the - * complexity of the HTML Processor (it needs to close out any open P element, - * meaning it can't be skipped here or else the HTML Processor will lose its - * place). For now, it can be ignored as it's a rare HTML tag in practice and - * any normative HTML should be using PRE instead. - */ - $this->bytes_already_parsed = strlen( $this->html ); - return false; - } + return true; + } + + /** + * Scans for the next token in the document. + * + * @return bool Whether a token was found before the end of the document.
+ */ + public function next_token() { + $was_at = $this->bytes_already_parsed; + $this->last_token_type = null; + + if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { + $this->continuation_state = self::STATE_COMPLETE; + return false; + } + + // Find the next tag if it exists. + if ( false === $this->parse_next_tag() ) { + $this->bytes_already_parsed = strlen( $this->html ); + $this->continuation_state = self::STATE_COMPLETE; + if ( $this->bytes_already_parsed > $was_at ) { + $this->last_token_type = self::TEXT_NODE; } - } while ( $already_found < $this->sought_match_offset ); + + return false; + } + + // Parse all of its attributes. + while ( $this->parse_next_attribute() ) { + continue; + } + + // Ensure that the tag closes before the end of the document. + if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { + $this->continuation_state = self::STATE_COMPLETE; + $this->last_token_type = self::TEXT_NODE; + return false; + } + + $tag_ends_at = strpos( $this->html, '>', $this->bytes_already_parsed ); + if ( false === $tag_ends_at ) { + $this->continuation_state = self::STATE_COMPLETE; + $this->last_token_type = self::TEXT_NODE; + return false; + } + $this->tag_ends_at = $tag_ends_at; + $this->bytes_already_parsed = min( strlen( $this->html ) - 1, $tag_ends_at + 1 ); + $this->continuation_state = self::STATE_IN_TAG; + $this->last_token_type = self::ELEMENT_NODE; + + /* + * For non-DATA sections which might contain text that looks like HTML tags but + * isn't, scan with the appropriate alternative mode. Looking at the first letter + * of the tag name as a pre-check avoids a string allocation when it's not needed. + */ + $t = $this->html[ $this->tag_name_starts_at ]; + if ( + ! $this->is_closing_tag && + ( + 'i' === $t || 'I' === $t || + 'n' === $t || 'N' === $t || + 's' === $t || 'S' === $t || + 't' === $t || 'T' === $t + ) ) { + $tag_name = $this->get_tag(); + + if ( 'SCRIPT' === $tag_name && !
$this->skip_script_data() ) { + $this->bytes_already_parsed = strlen( $this->html ); + return false; + } elseif ( + ( 'TEXTAREA' === $tag_name || 'TITLE' === $tag_name ) && + ! $this->skip_rcdata( $tag_name ) + ) { + $this->bytes_already_parsed = strlen( $this->html ); + return false; + } elseif ( + ( + 'IFRAME' === $tag_name || + 'NOEMBED' === $tag_name || + 'NOFRAMES' === $tag_name || + 'NOSCRIPT' === $tag_name || + 'STYLE' === $tag_name + ) && + ! $this->skip_rawtext( $tag_name ) + ) { + /* + * "XMP" should be here too but its rules are more complicated and require the + * complexity of the HTML Processor (it needs to close out any open P element, + * meaning it can't be skipped here or else the HTML Processor will lose its + * place). For now, it can be ignored as it's a rare HTML tag in practice and + * any normative HTML should be using PRE instead. + */ + $this->bytes_already_parsed = strlen( $this->html ); + return false; + } + } return true; } @@ -1108,205 +1139,211 @@ private function skip_script_data() { private function parse_next_tag() { $this->after_tag(); - $html = $this->html; - $doc_length = strlen( $html ); - $at = $this->bytes_already_parsed; + $html = $this->html; + $at = $this->bytes_already_parsed; + if ( $at >= strlen( $this->html ) ) { + $this->continuation_state = self::STATE_COMPLETE; + return false; + } - while ( false !== $at && $at < $doc_length ) { - $at = strpos( $html, '<', $at ); - if ( false === $at ) { - return false; - } + $at = strpos( $html, '<', $at ); + if ( false === $at ) { + $this->continuation_state = self::STATE_COMPLETE; + $this->last_token_type = self::TEXT_NODE; + return false; + } - if ( '/' === $this->html[ $at + 1 ] ) { - $this->is_closing_tag = true; - ++$at; - } else { - $this->is_closing_tag = false; - } + if ( '/' === $this->html[ $at + 1 ] ) { + $this->is_closing_tag = true; + ++$at; + } else { + $this->is_closing_tag = false; + } - /* - * HTML tag names must start with [a-zA-Z] otherwise they are not tags.
- * For example, "<3" is rendered as text, not a tag opener. If at least - * one letter follows the "<" then _it is_ a tag, but if the following - * character is anything else it _is not a tag_. - * - * It's not uncommon to find non-tags starting with `<` in an HTML - * document, so it's good for performance to make this pre-check before - * continuing to attempt to parse a tag name. - * - * Reference: - * * https://html.spec.whatwg.org/multipage/parsing.html#data-state - * * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state - */ - $tag_name_prefix_length = strspn( $html, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ', $at + 1 ); - if ( $tag_name_prefix_length > 0 ) { - ++$at; - $this->tag_name_length = $tag_name_prefix_length + strcspn( $html, " \t\f\r\n/>", $at + $tag_name_prefix_length ); - $this->tag_name_starts_at = $at; - $this->bytes_already_parsed = $at + $this->tag_name_length; - return true; - } + /* + * HTML tag names must start with [a-zA-Z] otherwise they are not tags. + * For example, "<3" is rendered as text, not a tag opener. If at least + * one letter follows the "<" then _it is_ a tag, but if the following + * character is anything else it _is not a tag_. + * + * It's not uncommon to find non-tags starting with `<` in an HTML + * document, so it's good for performance to make this pre-check before + * continuing to attempt to parse a tag name.
+ * + * Reference: + * * https://html.spec.whatwg.org/multipage/parsing.html#data-state + * * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state + */ + $tag_name_prefix_length = strspn( $html, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ', $at + 1 ); + if ( $tag_name_prefix_length > 0 ) { + ++$at; + $this->tag_name_length = $tag_name_prefix_length + strcspn( $html, " \t\f\r\n/>", $at + $tag_name_prefix_length ); + $this->tag_name_starts_at = $at; + $this->bytes_already_parsed = $at + $this->tag_name_length; + $this->continuation_state = self::STATE_IN_TAG; + $this->last_token_type = self::ELEMENT_NODE; + return true; + } - /* - * Abort if no tag is found before the end of - * the document. There is nothing left to parse. - */ - if ( $at + 1 >= strlen( $html ) ) { - return false; - } + /* + * Abort if no tag is found before the end of + * the document. There is nothing left to parse. + */ + if ( $at + 1 >= strlen( $html ) ) { + $this->continuation_state = self::STATE_COMPLETE; + $this->last_token_type = self::TEXT_NODE; + return false; + } + /* + * + * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state */ - if ( '!' === $html[ $at + 1 ] ) { - /* - * - * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state - */ - if ( - strlen( $html ) > $at + 3 && - '-' === $html[ $at + 2 ] && - '-' === $html[ $at + 3 ] - ) { - $closer_at = $at + 4; - // If it's not possible to close the comment then there is nothing more to scan. - if ( strlen( $html ) <= $closer_at ) { - return false; - } - - // Abruptly-closed empty comments are a sequence of dashes followed by `>`.
- $span_of_dashes = strspn( $html, '-', $closer_at ); - if ( '>' === $html[ $closer_at + $span_of_dashes ] ) { - $at = $closer_at + $span_of_dashes + 1; - continue; - } + if ( + strlen( $html ) > $at + 3 && + '-' === $html[ $at + 2 ] && + '-' === $html[ $at + 3 ] + ) { + $closer_at = $at + 4; + // If it's not possible to close the comment then there is nothing more to scan. + if ( strlen( $html ) <= $closer_at ) { + return false; + } - /* - * Comments may be closed by either a --> or an invalid --!>. - * The first occurrence closes the comment. - * - * See https://html.spec.whatwg.org/#parse-error-incorrectly-closed-comment - */ - --$closer_at; // Pre-increment inside condition below reduces risk of accidental infinite looping. - while ( ++$closer_at < strlen( $html ) ) { - $closer_at = strpos( $html, '--', $closer_at ); - if ( false === $closer_at ) { - return false; - } - - if ( $closer_at + 2 < strlen( $html ) && '>' === $html[ $closer_at + 2 ] ) { - $at = $closer_at + 3; - continue 2; - } - - if ( $closer_at + 3 < strlen( $html ) && '!' === $html[ $closer_at + 2 ] && '>' === $html[ $closer_at + 3 ] ) { - $at = $closer_at + 4; - continue 2; - } - } + // Abruptly-closed empty comments are a sequence of dashes followed by `>`. + $span_of_dashes = strspn( $html, '-', $closer_at ); + if ( '>' === $html[ $closer_at + $span_of_dashes ] ) { + $at = $closer_at + $span_of_dashes + 1; + return true; } /* - * - * The CDATA is case-sensitive. - * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state + * Comments may be closed by either a --> or an invalid --!>. + * The first occurrence closes the comment.
+ * + * See https://html.spec.whatwg.org/#parse-error-incorrectly-closed-comment */ - if ( - strlen( $html ) > $at + 8 && - '[' === $html[ $at + 2 ] && - 'C' === $html[ $at + 3 ] && - 'D' === $html[ $at + 4 ] && - 'A' === $html[ $at + 5 ] && - 'T' === $html[ $at + 6 ] && - 'A' === $html[ $at + 7 ] && - '[' === $html[ $at + 8 ] - ) { - $closer_at = strpos( $html, ']]>', $at + 9 ); + --$closer_at; // Pre-increment inside condition below reduces risk of accidental infinite looping. + while ( ++$closer_at < strlen( $html ) ) { + $closer_at = strpos( $html, '--', $closer_at ); if ( false === $closer_at ) { return false; } - $at = $closer_at + 3; - continue; - } - - /* - * - * These are ASCII-case-insensitive. - * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state - */ - if ( - strlen( $html ) > $at + 8 && - ( 'D' === $html[ $at + 2 ] || 'd' === $html[ $at + 2 ] ) && - ( 'O' === $html[ $at + 3 ] || 'o' === $html[ $at + 3 ] ) && - ( 'C' === $html[ $at + 4 ] || 'c' === $html[ $at + 4 ] ) && - ( 'T' === $html[ $at + 5 ] || 't' === $html[ $at + 5 ] ) && - ( 'Y' === $html[ $at + 6 ] || 'y' === $html[ $at + 6 ] ) && - ( 'P' === $html[ $at + 7 ] || 'p' === $html[ $at + 7 ] ) && - ( 'E' === $html[ $at + 8 ] || 'e' === $html[ $at + 8 ] ) - ) { - $closer_at = strpos( $html, '>', $at + 9 ); - if ( false === $closer_at ) { - return false; + if ( $closer_at + 2 < strlen( $html ) && '>' === $html[ $closer_at + 2 ] ) { + $at = $closer_at + 3; + return true; } - $at = $closer_at + 1; - continue; + if ( $closer_at + 3 < strlen( $html ) && '!' === $html[ $closer_at + 2 ] && '>' === $html[ $closer_at + 3 ] ) { + $at = $closer_at + 4; + return true; + } } - - /* - * Anything else here is an incorrectly-opened comment and transitions - * to the bogus comment state - skip to the nearest >. - */ - $at = strpos( $html, '>', $at + 1 ); - continue; - } - - /* - * is a missing end tag name, which is ignored.
- * - * See https://html.spec.whatwg.org/#parse-error-missing-end-tag-name - */ - if ( '>' === $html[ $at + 1 ] ) { - ++$at; - continue; } /* - * - * See https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state + * + * The CDATA is case-sensitive. + * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state */ - if ( '?' === $html[ $at + 1 ] ) { - $closer_at = strpos( $html, '>', $at + 2 ); + if ( + strlen( $html ) > $at + 8 && + '[' === $html[ $at + 2 ] && + 'C' === $html[ $at + 3 ] && + 'D' === $html[ $at + 4 ] && + 'A' === $html[ $at + 5 ] && + 'T' === $html[ $at + 6 ] && + 'A' === $html[ $at + 7 ] && + '[' === $html[ $at + 8 ] + ) { + $closer_at = strpos( $html, ']]>', $at + 9 ); if ( false === $closer_at ) { return false; } - $at = $closer_at + 1; - continue; + $at = $closer_at + 3; + return true; } /* - * If a non-alpha starts the tag name in a tag closer it's a comment. - * Find the first `>`, which closes the comment. - * - * See https://html.spec.whatwg.org/#parse-error-invalid-first-character-of-tag-name + * + * These are ASCII-case-insensitive. + * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state */ - if ( $this->is_closing_tag ) { - $closer_at = strpos( $html, '>', $at + 3 ); + if ( + strlen( $html ) > $at + 8 && + ( 'D' === $html[ $at + 2 ] || 'd' === $html[ $at + 2 ] ) && + ( 'O' === $html[ $at + 3 ] || 'o' === $html[ $at + 3 ] ) && + ( 'C' === $html[ $at + 4 ] || 'c' === $html[ $at + 4 ] ) && + ( 'T' === $html[ $at + 5 ] || 't' === $html[ $at + 5 ] ) && + ( 'Y' === $html[ $at + 6 ] || 'y' === $html[ $at + 6 ] ) && + ( 'P' === $html[ $at + 7 ] || 'p' === $html[ $at + 7 ] ) && + ( 'E' === $html[ $at + 8 ] || 'e' === $html[ $at + 8 ] ) + ) { + $closer_at = strpos( $html, '>', $at + 9 ); if ( false === $closer_at ) { return false; } $at = $closer_at + 1; - continue; + return true; } + /* + * Anything else here is an incorrectly-opened comment and transitions + * to the bogus comment state - skip to the nearest >.
+ */ + $at = strpos( $html, '>', $at + 1 ); + return true; + } + + /* + * is a missing end tag name, which is ignored. + * + * See https://html.spec.whatwg.org/#parse-error-missing-end-tag-name + */ + if ( '>' === $html[ $at + 1 ] ) { ++$at; + return true; + } + + /* + * + * See https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state + */ + if ( '?' === $html[ $at + 1 ] ) { + $closer_at = strpos( $html, '>', $at + 2 ); + if ( false === $closer_at ) { + return false; + } + + $at = $closer_at + 1; + return true; + } + + /* + * If a non-alpha starts the tag name in a tag closer it's a comment. + * Find the first `>`, which closes the comment. + * + * See https://html.spec.whatwg.org/#parse-error-invalid-first-character-of-tag-name + */ + if ( $this->is_closing_tag ) { + $closer_at = strpos( $html, '>', $at + 3 ); + if ( false === $closer_at ) { + return false; + } + + $at = $closer_at + 1; + return true; } + ++$at; return false; }