From a25b57a8f818d47477e9a5421e8b7d74fa8ac36e Mon Sep 17 00:00:00 2001
From: Dennis Snell <dennis.snell@automattic.com>
Date: Fri, 17 Nov 2023 15:42:30 -0700
Subject: [PATCH] Add `next_token()` method

---
 .../html-api/class-wp-html-tag-processor.php  | 491 ++++++++++--------
 1 file changed, 264 insertions(+), 227 deletions(-)

diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php
index 4cb3092f6c0a5..14354e9040125 100644
--- a/src/wp-includes/html-api/class-wp-html-tag-processor.php
+++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php
@@ -393,7 +393,7 @@ class WP_HTML_Tag_Processor {
 	 *
 	 * @var string|null
 	 */
-	private $last_token_type = null;
+	public $last_token_type = null;
 
 	/**
 	 * In what mode the parser should resume after pausing,
@@ -566,86 +566,117 @@ public function next_tag( $query = null ) {
 		$already_found = 0;
 
 		do {
-			if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
-				return false;
-			}
-
-			// Find the next tag if it exists.
-			if ( false === $this->parse_next_tag() ) {
-				$this->bytes_already_parsed = strlen( $this->html );
-
+			if ( false === $this->next_token() ) {
 				return false;
 			}
 
-			// Parse all of its attributes.
-			while ( $this->parse_next_attribute() ) {
+			if ( self::ELEMENT_NODE !== $this->last_token_type ) {
 				continue;
 			}
 
-			// Ensure that the tag closes before the end of the document.
-			if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
-				return false;
-			}
-
-			$tag_ends_at = strpos( $this->html, '>', $this->bytes_already_parsed );
-			if ( false === $tag_ends_at ) {
-				return false;
-			}
-			$this->tag_ends_at          = $tag_ends_at;
-			$this->bytes_already_parsed = min( strlen( $this->html ) - 1, $tag_ends_at + 1 );
-
 			// Finally, check if the parsed tag and its attributes match the search query.
 			if ( $this->matches() ) {
 				++$already_found;
 			}
+		} while ( $already_found < $this->sought_match_offset );
 
-			/*
-			 * For non-DATA sections which might contain text that looks like HTML tags but
-			 * isn't, scan with the appropriate alternative mode. Looking at the first letter
-			 * of the tag name as a pre-check avoids a string allocation when it's not needed.
-			 */
-			$t = $this->html[ $this->tag_name_starts_at ];
-			if (
-				! $this->is_closing_tag &&
-				(
-					'i' === $t || 'I' === $t ||
-					'n' === $t || 'N' === $t ||
-					's' === $t || 'S' === $t ||
-					't' === $t || 'T' === $t
-				) ) {
-				$tag_name = $this->get_tag();
-
-				if ( 'SCRIPT' === $tag_name && ! $this->skip_script_data() ) {
-					$this->bytes_already_parsed = strlen( $this->html );
-					return false;
-				} elseif (
-					( 'TEXTAREA' === $tag_name || 'TITLE' === $tag_name ) &&
-					! $this->skip_rcdata( $tag_name )
-				) {
-					$this->bytes_already_parsed = strlen( $this->html );
-					return false;
-				} elseif (
-					(
-						'IFRAME' === $tag_name ||
-						'NOEMBED' === $tag_name ||
-						'NOFRAMES' === $tag_name ||
-						'NOSCRIPT' === $tag_name ||
-						'STYLE' === $tag_name
-					) &&
-					! $this->skip_rawtext( $tag_name )
-				) {
-					/*
-					 * "XMP" should be here too but its rules are more complicated and require the
-					 * complexity of the HTML Processor (it needs to close out any open P element,
-					 * meaning it can't be skipped here or else the HTML Processor will lose its
-					 * place). For now, it can be ignored as it's a rare HTML tag in practice and
-					 * any normative HTML should be using PRE instead.
-					 */
-					$this->bytes_already_parsed = strlen( $this->html );
-					return false;
-				}
+		return true;
+	}
+
+	/**
+	 * Scans for the next token in the document and records its kind in `$last_token_type`.
+	 *
+	 * @return bool True when a tag was parsed through its closing `>`; false when the document ends first, even if trailing text was recorded as a TEXT_NODE.
+	 */
+	public function next_token() {
+		$was_at                = $this->bytes_already_parsed;
+		$this->last_token_type = null;
+
+		if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
+			$this->continuation_state = self::STATE_COMPLETE;
+			return false;
+		}
+
+		// Find the next tag if it exists; otherwise everything after `$was_at` is reported as text.
+		if ( false === $this->parse_next_tag() ) {
+			$this->bytes_already_parsed = strlen( $this->html );
+			$this->continuation_state   = self::STATE_COMPLETE;
+			if ( $this->bytes_already_parsed > $was_at ) {
+				$this->last_token_type = self::TEXT_NODE;
 			}
-		} while ( $already_found < $this->sought_match_offset );
+
+			return false;
+		}
+
+		// Parse all of its attributes.
+		while ( $this->parse_next_attribute() ) {
+			continue;
+		}
+
+		// Ensure that the tag closes before the end of the document; an unterminated tag is reported as text.
+		if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
+			$this->continuation_state = self::STATE_COMPLETE;
+			$this->last_token_type    = self::TEXT_NODE;
+			return false;
+		}
+
+		$tag_ends_at = strpos( $this->html, '>', $this->bytes_already_parsed );
+		if ( false === $tag_ends_at ) {
+			$this->continuation_state = self::STATE_COMPLETE;
+			$this->last_token_type    = self::TEXT_NODE;
+			return false;
+		}
+		$this->tag_ends_at          = $tag_ends_at;
+		$this->bytes_already_parsed = min( strlen( $this->html ) - 1, $tag_ends_at + 1 );
+		$this->continuation_state   = self::STATE_IN_TAG;
+		$this->last_token_type      = self::ELEMENT_NODE;
+
+		/*
+		 * For non-DATA sections which might contain text that looks like HTML tags but
+		 * isn't, scan with the appropriate alternative mode. Looking at the first letter
+		 * of the tag name as a pre-check avoids a string allocation when it's not needed.
+		 */
+		$t = $this->html[ $this->tag_name_starts_at ];
+		if (
+			! $this->is_closing_tag &&
+			(
+				'i' === $t || 'I' === $t ||
+				'n' === $t || 'N' === $t ||
+				's' === $t || 'S' === $t ||
+				't' === $t || 'T' === $t
+			) ) {
+			$tag_name = $this->get_tag();
+
+			if ( 'SCRIPT' === $tag_name && ! $this->skip_script_data() ) {
+				$this->bytes_already_parsed = strlen( $this->html );
+				return false;
+			} elseif (
+				( 'TEXTAREA' === $tag_name || 'TITLE' === $tag_name ) &&
+				! $this->skip_rcdata( $tag_name )
+			) {
+				$this->bytes_already_parsed = strlen( $this->html );
+				return false;
+			} elseif (
+				(
+					'IFRAME' === $tag_name ||
+					'NOEMBED' === $tag_name ||
+					'NOFRAMES' === $tag_name ||
+					'NOSCRIPT' === $tag_name ||
+					'STYLE' === $tag_name
+				) &&
+				! $this->skip_rawtext( $tag_name )
+			) {
+				/*
+				 * "XMP" should be here too but its rules are more complicated and require the
+				 * complexity of the HTML Processor (it needs to close out any open P element,
+				 * meaning it can't be skipped here or else the HTML Processor will lose its
+				 * place). For now, it can be ignored as it's a rare HTML tag in practice and
+				 * any normative HTML should be using PRE instead.
+				 */
+				$this->bytes_already_parsed = strlen( $this->html );
+				return false;
+			}
+		}
 
 		return true;
 	}
@@ -1108,205 +1139,211 @@ private function skip_script_data() {
 	private function parse_next_tag() {
 		$this->after_tag();
 
-		$html       = $this->html;
-		$doc_length = strlen( $html );
-		$at         = $this->bytes_already_parsed;
+		$html = $this->html;
+		$at   = $this->bytes_already_parsed;
+		if ( $at >= strlen( $this->html ) ) {
+			$this->continuation_state = self::STATE_COMPLETE;
+			return false;
+		}
 
-		while ( false !== $at && $at < $doc_length ) {
-			$at = strpos( $html, '<', $at );
-			if ( false === $at ) {
-				return false;
-			}
+		$at = strpos( $html, '<', $at );
+		if ( false === $at ) {
+			$this->continuation_state = self::STATE_COMPLETE;
+			$this->last_token_type    = self::TEXT_NODE;
+			return false;
+		}
 
-			if ( '/' === $this->html[ $at + 1 ] ) {
-				$this->is_closing_tag = true;
-				++$at;
-			} else {
-				$this->is_closing_tag = false;
-			}
+		if ( '/' === $this->html[ $at + 1 ] ) {
+			$this->is_closing_tag = true;
+			++$at;
+		} else {
+			$this->is_closing_tag = false;
+		}
 
-			/*
-			 * HTML tag names must start with [a-zA-Z] otherwise they are not tags.
-			 * For example, "<3" is rendered as text, not a tag opener. If at least
-			 * one letter follows the "<" then _it is_ a tag, but if the following
-			 * character is anything else it _is not a tag_.
-			 *
-			 * It's not uncommon to find non-tags starting with `<` in an HTML
-			 * document, so it's good for performance to make this pre-check before
-			 * continuing to attempt to parse a tag name.
-			 *
-			 * Reference:
-			 * * https://html.spec.whatwg.org/multipage/parsing.html#data-state
-			 * * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
-			 */
-			$tag_name_prefix_length = strspn( $html, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ', $at + 1 );
-			if ( $tag_name_prefix_length > 0 ) {
-				++$at;
-				$this->tag_name_length      = $tag_name_prefix_length + strcspn( $html, " \t\f\r\n/>", $at + $tag_name_prefix_length );
-				$this->tag_name_starts_at   = $at;
-				$this->bytes_already_parsed = $at + $this->tag_name_length;
-				return true;
-			}
+		/*
+		 * HTML tag names must start with [a-zA-Z] otherwise they are not tags.
+		 * For example, "<3" is rendered as text, not a tag opener. If at least
+		 * one letter follows the "<" then _it is_ a tag, but if the following
+		 * character is anything else it _is not a tag_.
+		 *
+		 * It's not uncommon to find non-tags starting with `<` in an HTML
+		 * document, so it's good for performance to make this pre-check before
+		 * continuing to attempt to parse a tag name.
+		 *
+		 * Reference:
+		 * * https://html.spec.whatwg.org/multipage/parsing.html#data-state
+		 * * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
+		 */
+		$tag_name_prefix_length = strspn( $html, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ', $at + 1 );
+		if ( $tag_name_prefix_length > 0 ) {
+			++$at;
+			$this->tag_name_length      = $tag_name_prefix_length + strcspn( $html, " \t\f\r\n/>", $at + $tag_name_prefix_length );
+			$this->tag_name_starts_at   = $at;
+			$this->bytes_already_parsed = $at + $this->tag_name_length;
+			$this->continuation_state   = self::STATE_IN_TAG;
+			$this->last_token_type      = self::ELEMENT_NODE;
+			return true;
+		}
 
-			/*
-			 * Abort if no tag is found before the end of
-			 * the document. There is nothing left to parse.
-			 */
-			if ( $at + 1 >= strlen( $html ) ) {
-				return false;
-			}
+		/*
+		 * Abort if no tag is found before the end of
+		 * the document. There is nothing left to parse.
+		 */
+		if ( $at + 1 >= strlen( $html ) ) {
+			$this->continuation_state = self::STATE_COMPLETE;
+			$this->last_token_type    = self::TEXT_NODE;
+			return false;
+		}
 
+		/*
+		 * <! transitions to markup declaration open state
+		 * https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
+		 */
+		if ( '!' === $html[ $at + 1 ] ) {
 			/*
-			 * <! transitions to markup declaration open state
-			 * https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
+			 * <!-- transitions to a bogus comment state – skip to the nearest -->
+			 * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
 			 */
-			if ( '!' === $html[ $at + 1 ] ) {
-				/*
-				 * <!-- transitions to a bogus comment state – skip to the nearest -->
-				 * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
-				 */
-				if (
-					strlen( $html ) > $at + 3 &&
-					'-' === $html[ $at + 2 ] &&
-					'-' === $html[ $at + 3 ]
-				) {
-					$closer_at = $at + 4;
-					// If it's not possible to close the comment then there is nothing more to scan.
-					if ( strlen( $html ) <= $closer_at ) {
-						return false;
-					}
-
-					// Abruptly-closed empty comments are a sequence of dashes followed by `>`.
-					$span_of_dashes = strspn( $html, '-', $closer_at );
-					if ( '>' === $html[ $closer_at + $span_of_dashes ] ) {
-						$at = $closer_at + $span_of_dashes + 1;
-						continue;
-					}
+			if (
+				strlen( $html ) > $at + 3 &&
+				'-' === $html[ $at + 2 ] &&
+				'-' === $html[ $at + 3 ]
+			) {
+				$closer_at = $at + 4;
+				// If it's not possible to close the comment then there is nothing more to scan.
+				if ( strlen( $html ) <= $closer_at ) {
+					return false;
+				}
 
-					/*
-					 * Comments may be closed by either a --> or an invalid --!>.
-					 * The first occurrence closes the comment.
-					 *
-					 * See https://html.spec.whatwg.org/#parse-error-incorrectly-closed-comment
-					 */
-					--$closer_at; // Pre-increment inside condition below reduces risk of accidental infinite looping.
-					while ( ++$closer_at < strlen( $html ) ) {
-						$closer_at = strpos( $html, '--', $closer_at );
-						if ( false === $closer_at ) {
-							return false;
-						}
-
-						if ( $closer_at + 2 < strlen( $html ) && '>' === $html[ $closer_at + 2 ] ) {
-							$at = $closer_at + 3;
-							continue 2;
-						}
-
-						if ( $closer_at + 3 < strlen( $html ) && '!' === $html[ $closer_at + 2 ] && '>' === $html[ $closer_at + 3 ] ) {
-							$at = $closer_at + 4;
-							continue 2;
-						}
-					}
+				// Abruptly-closed empty comments are a sequence of dashes followed by `>`.
+				$span_of_dashes = strspn( $html, '-', $closer_at );
+				if ( '>' === $html[ $closer_at + $span_of_dashes ] ) {
+					$at = $closer_at + $span_of_dashes + 1;
+					return true;
 				}
 
 				/*
-				 * <![CDATA[ transitions to CDATA section state – skip to the nearest ]]>
-				 * The CDATA is case-sensitive.
-				 * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
+				 * Comments may be closed by either a --> or an invalid --!>.
+				 * The first occurrence closes the comment.
+				 *
+				 * See https://html.spec.whatwg.org/#parse-error-incorrectly-closed-comment
 				 */
-				if (
-					strlen( $html ) > $at + 8 &&
-					'[' === $html[ $at + 2 ] &&
-					'C' === $html[ $at + 3 ] &&
-					'D' === $html[ $at + 4 ] &&
-					'A' === $html[ $at + 5 ] &&
-					'T' === $html[ $at + 6 ] &&
-					'A' === $html[ $at + 7 ] &&
-					'[' === $html[ $at + 8 ]
-				) {
-					$closer_at = strpos( $html, ']]>', $at + 9 );
+				--$closer_at; // Pre-increment inside condition below reduces risk of accidental infinite looping.
+				while ( ++$closer_at < strlen( $html ) ) {
+					$closer_at = strpos( $html, '--', $closer_at );
 					if ( false === $closer_at ) {
 						return false;
 					}
 
-					$at = $closer_at + 3;
-					continue;
-				}
-
-				/*
-				 * <!DOCTYPE transitions to DOCTYPE state – skip to the nearest >
-				 * These are ASCII-case-insensitive.
-				 * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
-				 */
-				if (
-					strlen( $html ) > $at + 8 &&
-					( 'D' === $html[ $at + 2 ] || 'd' === $html[ $at + 2 ] ) &&
-					( 'O' === $html[ $at + 3 ] || 'o' === $html[ $at + 3 ] ) &&
-					( 'C' === $html[ $at + 4 ] || 'c' === $html[ $at + 4 ] ) &&
-					( 'T' === $html[ $at + 5 ] || 't' === $html[ $at + 5 ] ) &&
-					( 'Y' === $html[ $at + 6 ] || 'y' === $html[ $at + 6 ] ) &&
-					( 'P' === $html[ $at + 7 ] || 'p' === $html[ $at + 7 ] ) &&
-					( 'E' === $html[ $at + 8 ] || 'e' === $html[ $at + 8 ] )
-				) {
-					$closer_at = strpos( $html, '>', $at + 9 );
-					if ( false === $closer_at ) {
-						return false;
+					if ( $closer_at + 2 < strlen( $html ) && '>' === $html[ $closer_at + 2 ] ) {
+						$at = $closer_at + 3;
+						return true;
 					}
 
-					$at = $closer_at + 1;
-					continue;
+					if ( $closer_at + 3 < strlen( $html ) && '!' === $html[ $closer_at + 2 ] && '>' === $html[ $closer_at + 3 ] ) {
+						$at = $closer_at + 4;
+						return true;
+					}
 				}
-
-				/*
-				 * Anything else here is an incorrectly-opened comment and transitions
-				 * to the bogus comment state - skip to the nearest >.
-				 */
-				$at = strpos( $html, '>', $at + 1 );
-				continue;
-			}
-
-			/*
-			 * </> is a missing end tag name, which is ignored.
-			 *
-			 * See https://html.spec.whatwg.org/#parse-error-missing-end-tag-name
-			 */
-			if ( '>' === $html[ $at + 1 ] ) {
-				++$at;
-				continue;
 			}
 
 			/*
-			 * <? transitions to a bogus comment state – skip to the nearest >
-			 * See https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
+			 * <![CDATA[ transitions to CDATA section state – skip to the nearest ]]>
+			 * The CDATA is case-sensitive.
+			 * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
 			 */
-			if ( '?' === $html[ $at + 1 ] ) {
-				$closer_at = strpos( $html, '>', $at + 2 );
+			if (
+				strlen( $html ) > $at + 8 &&
+				'[' === $html[ $at + 2 ] &&
+				'C' === $html[ $at + 3 ] &&
+				'D' === $html[ $at + 4 ] &&
+				'A' === $html[ $at + 5 ] &&
+				'T' === $html[ $at + 6 ] &&
+				'A' === $html[ $at + 7 ] &&
+				'[' === $html[ $at + 8 ]
+			) {
+				$closer_at = strpos( $html, ']]>', $at + 9 );
 				if ( false === $closer_at ) {
 					return false;
 				}
 
-				$at = $closer_at + 1;
-				continue;
+				$at = $closer_at + 3;
+				return true;
 			}
 
 			/*
-			 * If a non-alpha starts the tag name in a tag closer it's a comment.
-			 * Find the first `>`, which closes the comment.
-			 *
-			 * See https://html.spec.whatwg.org/#parse-error-invalid-first-character-of-tag-name
+			 * <!DOCTYPE transitions to DOCTYPE state – skip to the nearest >
+			 * These are ASCII-case-insensitive.
+			 * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
 			 */
-			if ( $this->is_closing_tag ) {
-				$closer_at = strpos( $html, '>', $at + 3 );
+			if (
+				strlen( $html ) > $at + 8 &&
+				( 'D' === $html[ $at + 2 ] || 'd' === $html[ $at + 2 ] ) &&
+				( 'O' === $html[ $at + 3 ] || 'o' === $html[ $at + 3 ] ) &&
+				( 'C' === $html[ $at + 4 ] || 'c' === $html[ $at + 4 ] ) &&
+				( 'T' === $html[ $at + 5 ] || 't' === $html[ $at + 5 ] ) &&
+				( 'Y' === $html[ $at + 6 ] || 'y' === $html[ $at + 6 ] ) &&
+				( 'P' === $html[ $at + 7 ] || 'p' === $html[ $at + 7 ] ) &&
+				( 'E' === $html[ $at + 8 ] || 'e' === $html[ $at + 8 ] )
+			) {
+				$closer_at = strpos( $html, '>', $at + 9 );
 				if ( false === $closer_at ) {
 					return false;
 				}
 
 				$at = $closer_at + 1;
-				continue;
+				return true;
 			}
 
+			/*
+			 * Anything else here is an incorrectly-opened comment and transitions
+			 * to the bogus comment state - skip to the nearest >.
+			 */
+			$at = strpos( $html, '>', $at + 1 );
+			return true;
+		}
+
+		/*
+		 * </> is a missing end tag name, which is ignored.
+		 *
+		 * See https://html.spec.whatwg.org/#parse-error-missing-end-tag-name
+		 */
+		if ( '>' === $html[ $at + 1 ] ) {
 			++$at;
+			return true;
+		}
+
+		/*
+		 * <? transitions to a bogus comment state – skip to the nearest >
+		 * See https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
+		 */
+		if ( '?' === $html[ $at + 1 ] ) {
+			$closer_at = strpos( $html, '>', $at + 2 );
+			if ( false === $closer_at ) {
+				return false;
+			}
+
+			$at = $closer_at + 1;
+			return true;
+		}
+
+		/*
+		 * If a non-alpha starts the tag name in a tag closer it's a comment.
+		 * Find the first `>`, which closes the comment.
+		 *
+		 * See https://html.spec.whatwg.org/#parse-error-invalid-first-character-of-tag-name
+		 */
+		if ( $this->is_closing_tag ) {
+			$closer_at = strpos( $html, '>', $at + 3 );
+			if ( false === $closer_at ) {
+				return false;
+			}
+
+			$at = $closer_at + 1;
+			return true;
 		}
 
+		++$at; // NOTE(review): dead store without the old scan loop; `$at` is local, so a non-tag `<` now ends the scan here. Confirm intended.
 		return false;
 	}