From 2051d388a9e3975231de9e1fe130ae3eb8b3c65c Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Fri, 12 Jan 2024 07:53:28 -0500 Subject: [PATCH] Provisionarily: add back CDATA and PI nodes --- .../html-api/class-wp-html-tag-processor.php | 104 +++++++++++++++++- 1 file changed, 103 insertions(+), 1 deletion(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 9eb6340594606..54a956695f2fc 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -1664,6 +1664,24 @@ private function parse_next_tag() { $this->text_starts_at = $this->token_starts_at + 2; $this->text_length = $closer_at - $this->text_starts_at; $this->bytes_already_parsed = $closer_at + 1; + + // Identify nodes that would be CDATA if HTML had CDATA sections. + if ( + $this->token_length >= 10 && + '[' === $html[ $this->token_starts_at + 2 ] && + 'C' === $html[ $this->token_starts_at + 3 ] && + 'D' === $html[ $this->token_starts_at + 4 ] && + 'A' === $html[ $this->token_starts_at + 5 ] && + 'T' === $html[ $this->token_starts_at + 6 ] && + 'A' === $html[ $this->token_starts_at + 7 ] && + '[' === $html[ $this->token_starts_at + 8 ] && + ']' === $html[ $closer_at - 1 ] + ) { + $this->parser_state = self::STATE_CDATA_NODE; + $this->text_starts_at += 7; + $this->text_length -= 9; + } + return true; } @@ -1700,6 +1718,41 @@ private function parse_next_tag() { $this->text_starts_at = $this->token_starts_at + 2; $this->text_length = $closer_at - $this->text_starts_at; $this->bytes_already_parsed = $closer_at + 1; + + /* + * Identify a Processing Instruction node were HTML to have them. + * + * XML allows for more target names, but this code only identifies + * a subset. This is more or less okay because ultimately these are + * HTML comments in the DOM and this safely supports _some_ kinds + * of PI Nodes without getting lost while parsing. 
+ * + * This code identifies processing instruction nodes whose target + * name can be represented in single-byte UTF-8 / 7-bit ASCII. + * + * > NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | + * [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | + * [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | + * [#x10000-#xEFFFF] + * > NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040] + * + * @see https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PITarget + */ + if ( $this->token_length >= 5 && '?' === $html[ $closer_at - 1 ] ) { + $comment_text = substr( $html, $this->token_starts_at + 2, $this->token_length - 4 ); + $pi_target_length = strspn( $comment_text, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ:_' ); + + if ( 0 < $pi_target_length ) { + $pi_target_length += strspn( $comment_text,'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789:_-.', $pi_target_length ); + + $this->parser_state = self::STATE_PI_NODE; + $this->tag_name_starts_at = $this->token_starts_at + 2; + $this->tag_name_length = $pi_target_length; + $this->text_starts_at += $pi_target_length; + $this->text_length -= $pi_target_length + 1; + } + } + return true; } @@ -2507,6 +2560,9 @@ public function get_token_type() { case self::STATE_DOCTYPE: return '#doctype'; + case self::STATE_PI_NODE: + return '#processing-instruction'; + default: return $this->get_token_name(); } @@ -2540,6 +2596,12 @@ public function get_token_name() { case self::STATE_TEXT_NODE: return '#text'; + case self::STATE_CDATA_NODE: + return '#cdata-section'; + + case self::STATE_PI_NODE: + return substr( $this->html, $this->tag_name_starts_at, $this->tag_name_length ); + case self::STATE_COMMENT: return '#comment'; @@ -2580,7 +2642,15 @@ public function get_modifiable_text() { $at = $this->text_starts_at; $length = $this->text_length; $text = substr( $this->html, $at, $length ); - $text = 
html_entity_decode( $text, ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE ); + + if ( + self::STATE_CDATA_NODE === $this->parser_state || + self::STATE_PI_NODE === $this->parser_state + ) { + return $text; + } + + $text = html_entity_decode( $text, ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE ); if ( empty( $text ) ) { return ''; @@ -3135,6 +3205,38 @@ private function matches() { */ const STATE_TEXT_NODE = 'STATE_TEXT_NODE'; + /** + * Parser CDATA Node State. + * + * Indicates that the parser has found a CDATA node and it's possible + * to read and modify its modifiable text. Note that in HTML there are + * no CDATA nodes outside foreign elements (SVG and MathML). Outside + * of foreign elements, they are treated as HTML comments. Nonetheless, + * the Tag Processor still recognizes them as they appear in the HTML + * stream and exposes them for inspection and modification. + * + * @since 6.5.0 + * + * @access private + */ + const STATE_CDATA_NODE = 'STATE_CDATA_NODE'; + + /** + * Parser Processing Instruction State. + * + * Indicates that the parser has found a Processing Instruction and + * it's possible to read and modify its modifiable text. Note that in + * HTML there are no Processing Instruction nodes and they are treated + * as HTML comments. Nonetheless, the Tag Processor still recognizes + * them as they appear in the HTML stream and exposes them for + * inspection and modification. + * + * @since 6.5.0 + * + * @access private + */ + const STATE_PI_NODE = 'STATE_PI_NODE'; + /** * Indicates that the parser has found an HTML comment and it's * possible to read and modify its modifiable text.