Skip to content

Commit

Permalink
Treat special tags as tokens with modifiable text
Browse files Browse the repository at this point in the history
  • Loading branch information
dmsnell committed Nov 29, 2023
1 parent d568152 commit dddef69
Showing 1 changed file with 143 additions and 62 deletions.
205 changes: 143 additions & 62 deletions src/wp-includes/html-api/class-wp-html-tag-processor.php
Original file line number Diff line number Diff line change
Expand Up @@ -378,6 +378,34 @@ class WP_HTML_Tag_Processor {
*/
private $is_closing_tag;

/**
* Byte offset into the document at which the current token starts.
*
* @var int
*/
private $token_starts_at = 0;

/**
* Number of bytes in the current token, counted from the token's starting offset.
*
* @var int
*/
private $token_length = 0;

/**
* Byte offset into the document at which the text segment inside the
* current token starts, or null if the token has no text segment.
*
* @var int|null
*/
private $text_starts_at = null;

/**
* Number of bytes in the text segment inside the current token,
* or null if the token has no text segment.
*
* @var int|null
*/
private $text_length = null;

/**
* What kind of node was parsed in the last step while scanning through the document,
* or if the parser hasn't paused on a matched token, then `null`.
Expand Down Expand Up @@ -567,8 +595,6 @@ public function next_tag( $query = null ) {

do {
if ( false === $this->next_token() ) {
$this->continuation_state = self::STATE_COMPLETE;
$this->bytes_already_parsed = strlen( $this->html );
return false;
}

Expand All @@ -593,69 +619,33 @@ public function next_tag( $query = null ) {
public function next_token() {
$was_at = $this->bytes_already_parsed;

if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
$this->continuation_state = self::STATE_COMPLETE;
$this->bytes_already_parsed = strlen( $this->html );
$this->last_token_type = null;
if ( $was_at >= strlen( $this->html ) ) {
$this->continuation_state = self::STATE_COMPLETE;
return false;
}

switch ( $this->continuation_state ) {
case self::STATE_RCDATA:
if ( ! $this->skip_rcdata( $this->get_tag() ) ) {
$this->continuation_state = self::STATE_COMPLETE;
$this->bytes_already_parsed = strlen( $this->html );
return false;
}

$this->last_token_type = self::TEXT_NODE;
$this->continuation_state = self::STATE_IN_TAG;
$this->tag_name_starts_at = $was_at;
$this->tag_name_length = $this->bytes_already_parsed - $was_at;
return true;

case self::STATE_SCRIPT_RCDATA:
if ( ! $this->skip_script_data() ) {
$this->continuation_state = self::STATE_COMPLETE;
$this->bytes_already_parsed = strlen( $this->html );
return false;
}

$this->last_token_type = self::TEXT_NODE;
$this->continuation_state = self::STATE_IN_TAG;
$this->tag_name_starts_at = $was_at;
$this->tag_name_length = $this->bytes_already_parsed - $was_at;
return true;

case self::STATE_RAWTEXT:
if ( ! $this->skip_rawtext( $this->get_tag() ) ) {
$this->continuation_state = self::STATE_COMPLETE;
$this->bytes_already_parsed = strlen( $this->html );
return false;
}

$this->last_token_type = self::TEXT_NODE;
$this->continuation_state = self::STATE_IN_TAG;
$this->tag_name_starts_at = $was_at;
$this->tag_name_length = $this->bytes_already_parsed - $was_at;
return true;
/*
* There is no use continuing to parse if the document ends in the middle
* of a syntax element. It's necessary to append the next chunk of HTML
* and then to start parsing again where the partial token started.
*/
if ( self::STATE_INCOMPLETE === $this->continuation_state ) {
return false;
}

$this->last_token_type = null;

// Find the next tag if it exists.
if ( false === $this->parse_next_tag() ) {
if ( $this->bytes_already_parsed > $was_at ) {
$this->continuation_state = self::STATE_COMPLETE;
$this->last_token_type = self::TEXT_NODE;
$this->tag_name_starts_at = $was_at;
$this->tag_name_length = strlen( $this->html ) - $was_at;
$this->continuation_state = self::STATE_COMPLETE;
return true;
}

$this->bytes_already_parsed = strlen( $this->html );
$this->continuation_state = self::STATE_COMPLETE;

$this->continuation_state = self::STATE_INCOMPLETE;
return false;
}

Expand All @@ -670,18 +660,16 @@ public function next_token() {

// Ensure that the tag closes before the end of the document.
if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
$this->continuation_state = self::STATE_COMPLETE;
$this->last_token_type = self::TEXT_NODE;
$this->continuation_state = self::STATE_INCOMPLETE;
return false;
}

$tag_ends_at = strpos( $this->html, '>', $this->bytes_already_parsed );
if ( false === $tag_ends_at ) {
$this->continuation_state = self::STATE_COMPLETE;
$this->last_token_type = self::TEXT_NODE;
$this->bytes_already_parsed = strlen( $this->html );
$this->continuation_state = self::STATE_INCOMPLETE;
return false;
}

$this->tag_ends_at = $tag_ends_at;
$this->bytes_already_parsed = min( strlen( $this->html ) - 1, $tag_ends_at + 1 );
$this->continuation_state = self::STATE_IN_TAG;
Expand All @@ -703,11 +691,56 @@ public function next_token() {
) ) {
$tag_name = $this->get_tag();

// @todo: Move into separate functions to handle these specific cases.
if ( 'SCRIPT' === $tag_name && ! $this->is_closing_tag ) {
$this->continuation_state = self::STATE_SCRIPT_RCDATA;
$tnsa = $this->tag_name_starts_at;
$tnl = $this->tag_name_length;
$tea = $this->tag_ends_at;
$this->last_token_type = self::ELEMENT_NODE;
$this->token_starts_at = $was_at;
$this->text_starts_at = $this->tag_ends_at + 1;
if ( ! $this->skip_script_data() ) {
$this->continuation_state = self::STATE_INCOMPLETE;
$this->bytes_already_parsed = $was_at;
return false;
}
$this->text_length = $this->bytes_already_parsed - $this->text_starts_at;

// @todo: Clarify this so it's less ad-hoc. (parse_next_tag() resets after_tag()).
$this->bytes_already_parsed += strlen( '</SCRIPT' );
while ( $this->parse_next_attribute() ) {
continue;
}

$this->token_length = $this->bytes_already_parsed - $was_at;
$this->tag_name_starts_at = $tnsa;
$this->tag_name_length = $tnl;
$this->tag_ends_at = $tea;
return true;
} elseif ('TEXTAREA' === $tag_name || 'TITLE' === $tag_name ) {
$this->continuation_state = self::STATE_RCDATA;
} else if ( ( 'TEXTAREA' === $tag_name || 'TITLE' === $tag_name ) && ! $this->is_closing_tag ) {
$tnsa = $this->tag_name_starts_at;
$tnl = $this->tag_name_length;
$tea = $this->tag_ends_at;
$this->last_token_type = self::ELEMENT_NODE;
$this->token_starts_at = $was_at;
$this->text_starts_at = $this->tag_ends_at + 1;
if ( ! $this->skip_rcdata( $tag_name ) ) {
$this->continuation_state = self::STATE_INCOMPLETE;
$this->bytes_already_parsed = $was_at;
return false;
}
$this->text_length = $this->bytes_already_parsed - $this->text_starts_at;

// @todo: Clarify this so it's less ad-hoc. (parse_next_tag() resets after_tag()).
$this->bytes_already_parsed += strlen( "</{$tag_name}" );
while ( $this->parse_next_attribute() ) {
continue;
}

$this->token_length = $this->bytes_already_parsed - $was_at;
$this->tag_name_starts_at = $tnsa;
$this->tag_name_length = $tnl;
$this->tag_ends_at = $tea;
return true;
} elseif (
(
Expand All @@ -716,7 +749,8 @@ public function next_token() {
'NOFRAMES' === $tag_name ||
'NOSCRIPT' === $tag_name ||
'STYLE' === $tag_name
)
) &&
! $this->is_closing_tag
) {
/*
* "XMP" should be here too but its rules are more complicated and require the
Expand All @@ -725,7 +759,29 @@ public function next_token() {
* place). For now, it can be ignored as it's a rare HTML tag in practice and
* any normative HTML should be using PRE instead.
*/
$this->continuation_state = self::STATE_RAWTEXT;
$tnsa = $this->tag_name_starts_at;
$tnl = $this->tag_name_length;
$tea = $this->tag_ends_at;
$this->last_token_type = self::ELEMENT_NODE;
$this->token_starts_at = $was_at;
$this->text_starts_at = $this->tag_ends_at + 1;
if ( ! $this->skip_rawtext( $tag_name ) ) {
$this->continuation_state = self::STATE_INCOMPLETE;
$this->bytes_already_parsed = $was_at;
return false;
}
$this->text_length = $this->bytes_already_parsed - $this->text_starts_at;

// @todo: Clarify this so it's less ad-hoc. (parse_next_tag() resets after_tag()).
$this->bytes_already_parsed += strlen( "</{$tag_name}" );
while ( $this->parse_next_attribute() ) {
continue;
}

$this->token_length = $this->bytes_already_parsed - $was_at;
$this->tag_name_starts_at = $tnsa;
$this->tag_name_length = $tnl;
$this->tag_ends_at = $tea;
return true;
}
}
Expand Down Expand Up @@ -1446,8 +1502,9 @@ private function parse_next_tag() {
}

$this->last_token_type = self::WP_FUNKY_COMMENT_NODE;
$this->tag_name_starts_at = $at;
$this->tag_name_length = $closer_at - $at - 2;
$this->tag_name_starts_at = $at - 1;
$this->tag_name_length = $closer_at - $at - 1;
$this->tag_ends_at = $closer_at;
$this->bytes_already_parsed = $closer_at + 1;
return true;
}
Expand Down Expand Up @@ -2149,7 +2206,23 @@ public function get_node_name() {
public function get_node_text() {
switch ( $this->last_token_type ) {
case self::ELEMENT_NODE:
return null;
switch ( $this->get_tag() ) {
case 'IFRAME':
case 'NOEMBED':
case 'NOFRAMES':
case 'NOSCRIPT':
case 'SCRIPT':
case 'STYLE':
case 'TITLE':
return substr(
$this->html,
$this->text_starts_at,
$this->text_length
);

default:
return null;

Check failure on line 2224 in src/wp-includes/html-api/class-wp-html-tag-processor.php

View workflow job for this annotation

GitHub Actions / PHP coding standards

Terminating statement must be indented to the same level as the CASE body
}

case self::TEXT_NODE:
return substr(
Expand All @@ -2171,6 +2244,13 @@ public function get_node_text() {
$this->tag_name_starts_at + 4,
$this->tag_name_length
);

case self::WP_FUNKY_COMMENT_NODE:
return substr(
$this->html,
$this->tag_name_starts_at + 2,
$this->tag_name_length
);
}
}

Expand Down Expand Up @@ -2689,4 +2769,5 @@ private function matches() {
const STATE_SCRIPT_RCDATA = 'The parser has opened a SCRIPT element and needs to find the RCDATA text contained within.';
const STATE_RAWTEXT = 'The parser has opened a rawtext element and needs to find the RAWTEXT text contained within.';
const STATE_RCDATA = 'The parser has opened an #rcdata element and needs to find the RCDATA text contained within.';
const STATE_INCOMPLETE = 'The parser has reached the end of the document while inside a token; more is expected from the document.';
}

0 comments on commit dddef69

Please sign in to comment.