Treat special tags as tokens with modifiable text

WordPress · Nov 29, 2023 · dddef69 · dddef69
1 parent d568152
commit dddef69
Showing 1 changed file with 143 additions and 62 deletions.
diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php
@@ -378,6 +378,34 @@ class WP_HTML_Tag_Processor {
 	 */
 	private $is_closing_tag;
 
+	/**
+	 * Byte offset into document at which point the current token starts.
+	 *
+	 * @var int
+	 */
+	private $token_starts_at = 0;
+
+	/**
+	 * Number of bytes in current token.
+	 *
+	 * @var int
+	 */
+	private $token_length = 0;
+
+	/**
+	 * Byte offset into document at which point text segment inside token starts, or null if none.
+	 *
+	 * @var int|null
+	 */
+	private $text_starts_at = null;
+
+	/**
+	 * Number of bytes in text segment inside token, or null if none.
+	 *
+	 * @var int|null
+	 */
+	private $text_length = null;
+
 	/**
 	 * What kind of node was parsed in the last step while scanning through the document,
 	 * or if the parser hasn't paused on a matched token, then `null`.
@@ -567,8 +595,6 @@ public function next_tag( $query = null ) {
 
 		do {
 			if ( false === $this->next_token() ) {
-				$this->continuation_state   = self::STATE_COMPLETE;
-				$this->bytes_already_parsed = strlen( $this->html );
 				return false;
 			}
 
@@ -593,69 +619,33 @@ public function next_tag( $query = null ) {
 	public function next_token() {
 		$was_at = $this->bytes_already_parsed;
 
-		if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
-			$this->continuation_state   = self::STATE_COMPLETE;
-			$this->bytes_already_parsed = strlen( $this->html );
-			$this->last_token_type      = null;
+		if ( $was_at >= strlen( $this->html ) ) {
+			$this->continuation_state = self::STATE_COMPLETE;
 			return false;
 		}
 
-		switch ( $this->continuation_state ) {
-			case self::STATE_RCDATA:
-				if ( ! $this->skip_rcdata( $this->get_tag() ) ) {
-					$this->continuation_state   = self::STATE_COMPLETE;
-					$this->bytes_already_parsed = strlen( $this->html );
-					return false;
-				}
-
-				$this->last_token_type    = self::TEXT_NODE;
-				$this->continuation_state = self::STATE_IN_TAG;
-				$this->tag_name_starts_at = $was_at;
-				$this->tag_name_length    = $this->bytes_already_parsed - $was_at;
-				return true;
-
-			case self::STATE_SCRIPT_RCDATA:
-				if ( ! $this->skip_script_data() ) {
-					$this->continuation_state   = self::STATE_COMPLETE;
-					$this->bytes_already_parsed = strlen( $this->html );
-					return false;
-				}
-
-				$this->last_token_type    = self::TEXT_NODE;
-				$this->continuation_state = self::STATE_IN_TAG;
-				$this->tag_name_starts_at = $was_at;
-				$this->tag_name_length    = $this->bytes_already_parsed - $was_at;
-				return true;
-
-			case self::STATE_RAWTEXT:
-				if ( ! $this->skip_rawtext( $this->get_tag() ) ) {
-					$this->continuation_state = self::STATE_COMPLETE;
-					$this->bytes_already_parsed = strlen( $this->html );
-					return false;
-				}
-
-				$this->last_token_type    = self::TEXT_NODE;
-				$this->continuation_state = self::STATE_IN_TAG;
-				$this->tag_name_starts_at = $was_at;
-				$this->tag_name_length    = $this->bytes_already_parsed - $was_at;
-				return true;
+		/*
+		 * There is no use continuing to parse if the document ends in the middle
+		 * of a syntax element. It's necessary to append the next chunk of HTML
+		 * and then to start parsing again where the partial token started.
+		 */
+		if ( self::STATE_INCOMPLETE === $this->continuation_state ) {
+			return false;
 		}
 
 		$this->last_token_type = null;
 
 		// Find the next tag if it exists.
 		if ( false === $this->parse_next_tag() ) {
 			if ( $this->bytes_already_parsed > $was_at ) {
+				$this->continuation_state = self::STATE_COMPLETE;
 				$this->last_token_type    = self::TEXT_NODE;
 				$this->tag_name_starts_at = $was_at;
 				$this->tag_name_length    = strlen( $this->html ) - $was_at;
-				$this->continuation_state = self::STATE_COMPLETE;
 				return true;
 			}
 
-			$this->bytes_already_parsed = strlen( $this->html );
-			$this->continuation_state   = self::STATE_COMPLETE;
-
+			$this->continuation_state   = self::STATE_INCOMPLETE;
 			return false;
 		}
 
@@ -670,18 +660,16 @@ public function next_token() {
 
 		// Ensure that the tag closes before the end of the document.
 		if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
-			$this->continuation_state = self::STATE_COMPLETE;
-			$this->last_token_type    = self::TEXT_NODE;
+			$this->continuation_state = self::STATE_INCOMPLETE;
 			return false;
 		}
 
 		$tag_ends_at = strpos( $this->html, '>', $this->bytes_already_parsed );
 		if ( false === $tag_ends_at ) {
-			$this->continuation_state   = self::STATE_COMPLETE;
-			$this->last_token_type      = self::TEXT_NODE;
-			$this->bytes_already_parsed = strlen( $this->html );
+			$this->continuation_state   = self::STATE_INCOMPLETE;
 			return false;
 		}
+
 		$this->tag_ends_at          = $tag_ends_at;
 		$this->bytes_already_parsed = min( strlen( $this->html ) - 1, $tag_ends_at + 1 );
 		$this->continuation_state   = self::STATE_IN_TAG;
@@ -703,11 +691,56 @@ public function next_token() {
 			) ) {
 			$tag_name = $this->get_tag();
 
+			// @todo: Move into separate functions to handle these specific cases.
 			if ( 'SCRIPT' === $tag_name && ! $this->is_closing_tag ) {
-				$this->continuation_state = self::STATE_SCRIPT_RCDATA;
+				$tnsa = $this->tag_name_starts_at;
+				$tnl  = $this->tag_name_length;
+				$tea  = $this->tag_ends_at;
+				$this->last_token_type = self::ELEMENT_NODE;
+				$this->token_starts_at = $was_at;
+				$this->text_starts_at  = $this->tag_ends_at + 1;
+				if ( ! $this->skip_script_data() ) {
+					$this->continuation_state   = self::STATE_INCOMPLETE;
+					$this->bytes_already_parsed = $was_at;
+					return false;
+				}
+				$this->text_length = $this->bytes_already_parsed - $this->text_starts_at;
+
+				// @todo: Clarify this so it's less ad-hoc. (parse_next_tag() resets after_tag()).
+				$this->bytes_already_parsed += strlen( '</SCRIPT' );
+				while ( $this->parse_next_attribute() ) {
+					continue;
+				}
+
+				$this->token_length       = $this->bytes_already_parsed - $was_at;
+				$this->tag_name_starts_at = $tnsa;
+				$this->tag_name_length    = $tnl;
+				$this->tag_ends_at        = $tea;
 				return true;
-			} elseif ('TEXTAREA' === $tag_name || 'TITLE' === $tag_name ) {
-				$this->continuation_state = self::STATE_RCDATA;
+			} else if ( ( 'TEXTAREA' === $tag_name || 'TITLE' === $tag_name ) && ! $this->is_closing_tag ) {
+				$tnsa = $this->tag_name_starts_at;
+				$tnl  = $this->tag_name_length;
+				$tea  = $this->tag_ends_at;
+				$this->last_token_type = self::ELEMENT_NODE;
+				$this->token_starts_at = $was_at;
+				$this->text_starts_at  = $this->tag_ends_at + 1;
+				if ( ! $this->skip_rcdata( $tag_name ) ) {
+					$this->continuation_state   = self::STATE_INCOMPLETE;
+					$this->bytes_already_parsed = $was_at;
+					return false;
+				}
+				$this->text_length = $this->bytes_already_parsed - $this->text_starts_at;
+
+				// @todo: Clarify this so it's less ad-hoc. (parse_next_tag() resets after_tag()).
+				$this->bytes_already_parsed += strlen( "</{$tag_name}" );
+				while ( $this->parse_next_attribute() ) {
+					continue;
+				}
+
+				$this->token_length       = $this->bytes_already_parsed - $was_at;
+				$this->tag_name_starts_at = $tnsa;
+				$this->tag_name_length    = $tnl;
+				$this->tag_ends_at        = $tea;
 				return true;
 			} elseif (
 				(
@@ -716,7 +749,8 @@ public function next_token() {
 					'NOFRAMES' === $tag_name ||
 					'NOSCRIPT' === $tag_name ||
 					'STYLE' === $tag_name
-				)
+				) &&
+				! $this->is_closing_tag
 			) {
 				/*
 				 * "XMP" should be here too but its rules are more complicated and require the
@@ -725,7 +759,29 @@ public function next_token() {
 				 * place). For now, it can be ignored as it's a rare HTML tag in practice and
 				 * any normative HTML should be using PRE instead.
 				 */
-				$this->continuation_state = self::STATE_RAWTEXT;
+				$tnsa = $this->tag_name_starts_at;
+				$tnl  = $this->tag_name_length;
+				$tea  = $this->tag_ends_at;
+				$this->last_token_type = self::ELEMENT_NODE;
+				$this->token_starts_at = $was_at;
+				$this->text_starts_at  = $this->tag_ends_at + 1;
+				if ( ! $this->skip_rawtext( $tag_name ) ) {
+					$this->continuation_state   = self::STATE_INCOMPLETE;
+					$this->bytes_already_parsed = $was_at;
+					return false;
+				}
+				$this->text_length = $this->bytes_already_parsed - $this->text_starts_at;
+
+				// @todo: Clarify this so it's less ad-hoc. (parse_next_tag() resets after_tag()).
+				$this->bytes_already_parsed += strlen( "</{$tag_name}" );
+				while ( $this->parse_next_attribute() ) {
+					continue;
+				}
+
+				$this->token_length       = $this->bytes_already_parsed - $was_at;
+				$this->tag_name_starts_at = $tnsa;
+				$this->tag_name_length    = $tnl;
+				$this->tag_ends_at        = $tea;
 				return true;
 			}
 		}
@@ -1446,8 +1502,9 @@ private function parse_next_tag() {
 			}
 
 			$this->last_token_type      = self::WP_FUNKY_COMMENT_NODE;
-			$this->tag_name_starts_at   = $at;
-			$this->tag_name_length      = $closer_at - $at - 2;
+			$this->tag_name_starts_at   = $at - 1;
+			$this->tag_name_length      = $closer_at - $at - 1;
+			$this->tag_ends_at          = $closer_at;
 			$this->bytes_already_parsed = $closer_at + 1;
 			return true;
 		}
@@ -2149,7 +2206,23 @@ public function get_node_name() {
 	public function get_node_text() {
 		switch ( $this->last_token_type ) {
 			case self::ELEMENT_NODE:
-				return null;
+				switch ( $this->get_tag() ) {
+					case 'IFRAME':
+					case 'NOEMBED':
+					case 'NOFRAMES':
+					case 'NOSCRIPT':
+					case 'SCRIPT':
+					case 'STYLE':
+					case 'TITLE':
+						return substr(
+							$this->html,
+							$this->text_starts_at,
+							$this->text_length
+						);
+
+					default:
+						 return null;
+				}
 
 			case self::TEXT_NODE:
 				return substr(
@@ -2171,6 +2244,13 @@ public function get_node_text() {
 					$this->tag_name_starts_at + 4,
 					$this->tag_name_length
 				);
+
+			case self::WP_FUNKY_COMMENT_NODE:
+				return substr(
+					$this->html,
+					$this->tag_name_starts_at + 2,
+					$this->tag_name_length
+				);
 		}
 	}
 
@@ -2689,4 +2769,5 @@ private function matches() {
 	const STATE_SCRIPT_RCDATA = 'The parser has opened a SCRIPT element and needs to find the RCDATA text contained within.';
 	const STATE_RAWTEXT = 'The parser has opened a rawtext element and needs to find the RAWTEXT text contained within.';
 	const STATE_RCDATA = 'The parser has opened an #rcdata element and needs to find the RCDATA text contained within.';
+	const STATE_INCOMPLETE = 'The parser has reached the end of the document while inside a token; more is expected from the document.';
 }