Update to WP_Token_Map

WordPress · Apr 13, 2024 · 18f481f · 18f481f
1 parent 43be97a
commit 18f481f
Show file tree

Hide file tree

Showing 5 changed files with 433 additions and 1,685 deletions.
diff --git a/src/wp-includes/class-wp-token-map.php b/src/wp-includes/class-wp-token-map.php
@@ -0,0 +1,299 @@
+<?php
+
+/**
+ * Compact lookup table from byte-string "words" (tokens) to byte-string mappings.
+ *
+ * Words longer than the key length are grouped by their first `key_length`
+ * bytes; each group is serialized into one string of length-prefixed
+ * (word remainder, mapping) records. Words no longer than the key length
+ * are stored in a single NUL-padded, fixed-width string whose entries line
+ * up by position with the entries of a list of mappings.
+ */
+class WP_Token_Map {
+	/**
+	 * Exclusive upper bound, in bytes, on the length of a word or mapping.
+	 *
+	 * Lengths are serialized into a single byte inside the group strings,
+	 * so anything this long or longer cannot be represented.
+	 */
+	const MAX_LENGTH = 256;
+
+	/**
+	 * Number of leading bytes of a word used as its group key, and the
+	 * width of every entry in the small-words string.
+	 *
+	 * @var int
+	 */
+	private $key_length = 2;
+
+	/**
+	 * Stores an optimized form of the word set, where words are grouped
+	 * by their first `key_length` bytes and then collapsed into a string.
+	 *
+	 * Each group string is a sequence of records of the form:
+	 * one length byte, the word remainder, one length byte, the mapping.
+	 *
+	 * @var array
+	 */
+	private $large_words = array();
+
+	/**
+	 * Stores an optimized row of short words, where every entry is
+	 * `key_length` bytes long and NUL-padded if the word is shorter.
+	 *
+	 * @var string
+	 */
+	private $small_words = '';
+
+	/**
+	 * Holds mapping according to the index in the small words.
+	 *
+	 * @var string[]
+	 */
+	private $small_mappings = array();
+
+	/**
+	 * Builds a token map from an associative array of word => mapping.
+	 *
+	 * @param array $mappings   Keys are the words to look up, values are their mappings.
+	 * @param int   $key_length Optional. Number of leading bytes used to group words. Default 2.
+	 * @return WP_Token_Map|null The map, or null if a word is not a string key or if
+	 *                           a word or a string mapping is MAX_LENGTH bytes or longer.
+	 */
+	public static function from_array( $mappings, $key_length = 2 ) {
+		$map             = new WP_Token_Map();
+		$map->key_length = $key_length;
+
+		// Start by grouping words.
+
+		$groups = array();
+		$shorts = array();
+		foreach ( $mappings as $word => $mapping ) {
+			if ( ! is_string( $word ) || self::MAX_LENGTH <= strlen( $word ) ) {
+				return null;
+			}
+
+			// The mapping length is stored in a single byte; longer values would corrupt the table.
+			if ( is_string( $mapping ) && self::MAX_LENGTH <= strlen( $mapping ) ) {
+				return null;
+			}
+
+			if ( $key_length >= strlen( $word ) ) {
+				// Keep the mapping next to its word so that sorting cannot separate them.
+				$shorts[] = array( $word, $mapping );
+				continue;
+			}
+
+			$group = substr( $word, 0, $key_length );
+
+			if ( ! isset( $groups[ $group ] ) ) {
+				$groups[ $group ] = array();
+			}
+
+			$groups[ $group ][] = array( substr( $word, $key_length ), $mapping );
+		}
+
+		// Sort the words by longest-first, then alphabetical.
+
+		usort( $shorts, array( self::class, 'longest_first_then_alphabetical' ) );
+		foreach ( $groups as $group_key => $group ) {
+			usort( $groups[ $group_key ], array( self::class, 'longest_first_then_alphabetical' ) );
+		}
+
+		// Finally construct the optimized lookups.
+
+		foreach ( $shorts as $short ) {
+			list( $word, $mapping ) = $short;
+
+			$map->small_words     .= str_pad( $word, $key_length, "\x00" );
+			$map->small_mappings[] = $mapping;
+		}
+
+		foreach ( $groups as $group => $group_words ) {
+			$group_string = '';
+
+			foreach ( $group_words as $group_word ) {
+				list( $word, $mapping ) = $group_word;
+
+				$group_string .= pack( 'C', strlen( $word ) ) . $word . pack( 'C', strlen( $mapping ) ) . $mapping;
+			}
+
+			$map->large_words[ $group ] = $group_string;
+		}
+
+		return $map;
+	}
+
+	/**
+	 * Creates a token map from tables that were already computed, e.g. from
+	 * the source code produced by `precomputed_php_source_table()`.
+	 *
+	 * The tables are stored as given; no validation is performed.
+	 *
+	 * @param int      $key_length     Group key length the tables were built with.
+	 * @param array    $large_words    Group key => serialized group string.
+	 * @param string   $small_words    Fixed-width, NUL-padded short words.
+	 * @param string[] $small_mappings Mappings for the short words, in the same order.
+	 * @return WP_Token_Map
+	 */
+	public static function from_precomputed_table( $key_length, $large_words, $small_words, $small_mappings ) {
+		$map = new WP_Token_Map();
+
+		$map->key_length     = $key_length;
+		$map->large_words    = $large_words;
+		$map->small_words    = $small_words;
+		$map->small_mappings = $small_mappings;
+
+		return $map;
+	}
+
+	/**
+	 * Looks up a complete word in the map.
+	 *
+	 * Despite the name, a successful lookup returns the word's mapping
+	 * rather than `true`, so compare the result against `false` strictly.
+	 *
+	 * @param string $word Word to look up.
+	 * @return string|false The mapping for the word, or false if the word is not in the map.
+	 */
+	public function contains( $word ) {
+		if ( $this->key_length >= strlen( $word ) ) {
+			$index = $this->small_word_index( $word );
+			if ( null === $index || ! isset( $this->small_mappings[ $index ] ) ) {
+				return false;
+			}
+
+			return $this->small_mappings[ $index ];
+		}
+
+		$group_key = substr( $word, 0, $this->key_length );
+		if ( ! isset( $this->large_words[ $group_key ] ) ) {
+			return false;
+		}
+
+		$group        = $this->large_words[ $group_key ];
+		$group_length = strlen( $group );
+		$slug         = substr( $word, $this->key_length );
+		$slug_length  = strlen( $slug );
+		$at           = 0;
+		while ( $at < $group_length ) {
+			$token_length   = ord( $group[ $at++ ] );
+			$token_at       = $at;
+			$at            += $token_length;
+			$mapping_length = ord( $group[ $at++ ] );
+			$mapping_at     = $at;
+			$at            += $mapping_length;
+
+			if ( $token_length === $slug_length && 0 === substr_compare( $group, $slug, $token_at, $token_length ) ) {
+				return substr( $group, $mapping_at, $mapping_length );
+			}
+		}
+
+		return false;
+	}
+
+	/**
+	 * Reads the longest word in the map that starts at the given byte offset in the text.
+	 *
+	 * @param string $text       Text in which to look for a word.
+	 * @param int    $offset     Byte offset into the text at which the word must start.
+	 * @param int    $skip_bytes Set to the byte length of the matched word.
+	 *                           Left untouched when nothing matches.
+	 * @return string|false The mapping for the matched word, or false if no word matches.
+	 */
+	public function read_token( $text, $offset, &$skip_bytes ) {
+		$text_length = strlen( $text );
+		if ( $offset < 0 || $offset >= $text_length ) {
+			return false;
+		}
+
+		$remaining = $text_length - $offset;
+
+		// Search for a long word first, if enough text remains, and if that fails, a short one.
+		if ( $remaining > $this->key_length ) {
+			$group_key = substr( $text, $offset, $this->key_length );
+
+			if ( isset( $this->large_words[ $group_key ] ) ) {
+				$group        = $this->large_words[ $group_key ];
+				$group_length = strlen( $group );
+				$token_start  = $offset + $this->key_length;
+				$at           = 0;
+
+				// Groups built by `from_array()` list longer words first, so the first hit is the longest.
+				while ( $at < $group_length ) {
+					$token_length   = ord( $group[ $at++ ] );
+					$token          = substr( $group, $at, $token_length );
+					$at            += $token_length;
+					$mapping_length = ord( $group[ $at++ ] );
+					$mapping_at     = $at;
+					$at            += $mapping_length;
+
+					if ( 0 === substr_compare( $text, $token, $token_start, $token_length ) ) {
+						$skip_bytes = $this->key_length + $token_length;
+						return substr( $group, $mapping_at, $mapping_length );
+					}
+				}
+			}
+		}
+
+		// Perhaps a short word then: try the widest candidate first so that the longest word wins.
+		for ( $length = min( $this->key_length, $remaining ); $length > 0; $length-- ) {
+			$index = $this->small_word_index( substr( $text, $offset, $length ) );
+
+			if ( null !== $index && isset( $this->small_mappings[ $index ] ) ) {
+				$skip_bytes = $length;
+				return $this->small_mappings[ $index ];
+			}
+		}
+
+		return false;
+	}
+
+	/**
+	 * Exports the map as a list of `array( word, mapping )` pairs.
+	 *
+	 * Short words come first, followed by the long words group by group.
+	 *
+	 * @return array[] List of two-element arrays: the word and its mapping.
+	 */
+	public function to_array() {
+		$tokens = array();
+
+		$small_length  = strlen( $this->small_words );
+		$small_mapping = 0;
+		for ( $at = 0; $at < $small_length; $at += $this->key_length ) {
+			$tokens[] = array(
+				rtrim( substr( $this->small_words, $at, $this->key_length ), "\x00" ),
+				$this->small_mappings[ $small_mapping++ ],
+			);
+		}
+
+		foreach ( $this->large_words as $prefix => $group ) {
+			foreach ( self::parse_group( $group ) as $entry ) {
+				$tokens[] = array( $prefix . $entry[0], $entry[1] );
+			}
+		}
+
+		return $tokens;
+	}
+
+	/**
+	 * Generates PHP source code which rebuilds this map through
+	 * `from_precomputed_table()` without repeating the work of `from_array()`.
+	 *
+	 * @param string $indent Optional. String used for one level of indentation. Default one tab.
+	 * @return string PHP source code of a `from_precomputed_table()` call, terminated by `;`.
+	 */
+	public function precomputed_php_source_table( $indent = "\t" ) {
+		$i1 = $indent;
+		$i2 = $indent . $indent;
+
+		$output  = self::class . "::from_precomputed_table(\n";
+		$output .= "{$i1}{$this->key_length},\n";
+		$output .= "{$i1}array(\n";
+
+		$prefixes = array_keys( $this->large_words );
+		sort( $prefixes );
+		foreach ( $prefixes as $prefix ) {
+			// The prefix is emitted inside single quotes, where only `'` and `\` are special.
+			$quoted_prefix  = addcslashes( (string) $prefix, "'\\" );
+			$comment_prefix = self::escape_for_double_quotes( (string) $prefix );
+			$comment_line   = "{$i2}//";
+			$data_line      = "{$i2}'{$quoted_prefix}' => \"";
+
+			foreach ( self::parse_group( $this->large_words[ $prefix ] ) as $entry ) {
+				// Lengths describe the raw bytes, so take them before escaping.
+				$token_digits   = str_pad( dechex( strlen( $entry[0] ) ), 2, '0', STR_PAD_LEFT );
+				$mapping_digits = str_pad( dechex( strlen( $entry[1] ) ), 2, '0', STR_PAD_LEFT );
+
+				$token   = self::escape_for_double_quotes( $entry[0] );
+				$mapping = self::escape_for_double_quotes( $entry[1] );
+
+				$comment_line .= " {$comment_prefix}{$token}[{$mapping}]";
+				$data_line    .= "\\x{$token_digits}{$token}\\x{$mapping_digits}{$mapping}";
+			}
+			$comment_line .= "\n";
+			$data_line    .= "\",\n";
+
+			$output .= $comment_line;
+			$output .= $data_line;
+		}
+
+		$output .= "{$i1}),\n";
+
+		// The short words must keep their order: their position selects the mapping.
+		$small_text = self::escape_for_double_quotes( $this->small_words );
+		$output    .= "{$i1}\"{$small_text}\",\n";
+
+		$output .= "{$i1}array(\n";
+		foreach ( $this->small_mappings as $mapping ) {
+			$mapping = self::escape_for_double_quotes( (string) $mapping );
+			$output .= "{$i2}\"{$mapping}\",\n";
+		}
+		$output .= "{$i1})\n";
+
+		$output .= ");\n";
+
+		return $output;
+	}
+
+	/**
+	 * Finds the index of a short word inside the small-words table.
+	 *
+	 * A plain `strpos()` may report a match which straddles two neighboring
+	 * entries, so only hits that start on an entry boundary are accepted.
+	 *
+	 * @param string $word Candidate word, at most `key_length` bytes long.
+	 * @return int|null Index of the word and of its mapping, or null if not found.
+	 */
+	private function small_word_index( $word ) {
+		$stride = $this->key_length;
+
+		// NUL is the padding byte; a word containing it cannot be told apart from a shorter word.
+		if ( $stride < 1 || strlen( $word ) > $stride || false !== strpos( $word, "\x00" ) ) {
+			return null;
+		}
+
+		$needle = str_pad( $word, $stride, "\x00" );
+		$at     = strpos( $this->small_words, $needle );
+		while ( false !== $at ) {
+			if ( 0 === $at % $stride ) {
+				return (int) ( $at / $stride );
+			}
+
+			$at = strpos( $this->small_words, $needle, $at + 1 );
+		}
+
+		return null;
+	}
+
+	/**
+	 * Decodes a serialized group string into its records.
+	 *
+	 * @param string $group Sequence of length-prefixed (word remainder, mapping) records.
+	 * @return array[] List of two-element arrays: the word remainder and its mapping.
+	 */
+	private static function parse_group( $group ) {
+		$entries = array();
+		$length  = strlen( $group );
+		$at      = 0;
+
+		while ( $at < $length ) {
+			$token_length   = ord( $group[ $at++ ] );
+			$token          = (string) substr( $group, $at, $token_length );
+			$at            += $token_length;
+			$mapping_length = ord( $group[ $at++ ] );
+			$mapping        = (string) substr( $group, $at, $mapping_length );
+			$at            += $mapping_length;
+
+			$entries[] = array( $token, $mapping );
+		}
+
+		return $entries;
+	}
+
+	/**
+	 * Escapes raw bytes for inclusion inside a double-quoted PHP string literal.
+	 *
+	 * Control bytes always become two-digit `\xNN` escapes so that a hex digit
+	 * following them cannot be absorbed into the escape sequence.
+	 *
+	 * @param string $bytes Raw bytes.
+	 * @return string Text which evaluates to the same bytes inside double quotes.
+	 */
+	private static function escape_for_double_quotes( $bytes ) {
+		return preg_replace_callback(
+			'~[\x00-\x1f"$\\\\]~',
+			static function ( $match ) {
+				$byte = $match[0];
+
+				if ( '"' === $byte || '$' === $byte || '\\' === $byte ) {
+					return '\\' . $byte;
+				}
+
+				return '\\x' . str_pad( dechex( ord( $byte ) ), 2, '0', STR_PAD_LEFT );
+			},
+			$bytes
+		);
+	}
+
+	/**
+	 * Sort callback ordering `array( word, mapping )` pairs by word:
+	 * longer words first, and words of equal length in byte order.
+	 *
+	 * @param array $a First pair.
+	 * @param array $b Second pair.
+	 * @return int Negative, zero, or positive, as expected by `usort()`.
+	 */
+	private static function longest_first_then_alphabetical( $a, $b ) {
+		if ( $a[0] === $b[0] ) {
+			return 0;
+		}
+
+		$la = strlen( $a[0] );
+		$lb = strlen( $b[0] );
+
+		// Longer strings are less-than for comparison's sake.
+		if ( $la !== $lb ) {
+			return $lb - $la;
+		}
+
+		return strcmp( $a[0], $b[0] );
+	}
+}
diff --git a/src/wp-includes/html-api/class-wp-html-decoder.php b/src/wp-includes/html-api/class-wp-html-decoder.php
@@ -101,8 +101,9 @@ public static function read_character_reference( $text, $at, $allow_ambiguous_am
 			}
 
 			// Cannot encode invalid Unicode code points. Max is to U+10FFFF.
-			$digit_count   = strspn( $text, $numeric_digits, $digits_at );
-			$after_digits  = $digits_at + $digit_count;
+			$zero_count    = strspn( $text, '0', $digits_at );
+			$digit_count   = strspn( $text, $numeric_digits, $digits_at + $zero_count );
+			$after_digits  = $digits_at + $zero_count + $digit_count;
 			$has_semicolon = $after_digits < $length && ';' === $text[ $after_digits ];
 			$end_of_span   = $has_semicolon ? $after_digits + 1 : $after_digits;
 
@@ -111,12 +112,12 @@ public static function read_character_reference( $text, $at, $allow_ambiguous_am
 				return null;
 			}
 
-			if ( $digit_count > $max_digits ) {
+			if ( $digit_count - $zero_count > $max_digits ) {
 				$skip_bytes = $end_of_span - $at;
 				return '�';
 			}
 
-			$digits     = substr( $text, $digits_at, $digit_count );
+			$digits     = substr( $text, $digits_at + $zero_count, $digit_count );
 			$code_point = intval( $digits, $numeric_base );
 
 			if (
@@ -129,7 +130,18 @@ public static function read_character_reference( $text, $at, $allow_ambiguous_am
 				// Surrogate.
 				( $code_point >= 0xD800 && $code_point <= 0xDFFF ) ||
 
-				// Noncharacters.
+				/*
+				 * Noncharacters.
+				 *
+				 * > A noncharacter is a code point that is in the range U+FDD0 to U+FDEF,
+				 * > inclusive, or U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF,
+				 * > U+3FFFE, U+3FFFF, U+4FFFE, U+4FFFF, U+5FFFE, U+5FFFF, U+6FFFE,
+				 * > U+6FFFF, U+7FFFE, U+7FFFF, U+8FFFE, U+8FFFF, U+9FFFE, U+9FFFF,
+				 * > U+AFFFE, U+AFFFF, U+BFFFE, U+BFFFF, U+CFFFE, U+CFFFF, U+DFFFE,
+				 * > U+DFFFF, U+EFFFE, U+EFFFF, U+FFFFE, U+FFFFF, U+10FFFE, or U+10FFFF.
+				 *
+				 * @see https://infra.spec.whatwg.org/#noncharacter
+				 */
 				( $code_point >= 0xFDD0 && $code_point <= 0xFDEF ) ||
 				( 0xFFFE === ( $code_point & 0xFFFE ) ) ||
 
@@ -204,22 +216,18 @@ public static function read_character_reference( $text, $at, $allow_ambiguous_am
 			return null;
 		}
 
-		// Advance past the `&`.
-		++$name_at;
-
-		$name = $html5_named_character_entity_set->read_token( $text, $name_at );
+		$name = $html5_named_character_entity_set->read_token( $text, $name_at, $name_length );
 		if ( false === $name ) {
 			return null;
 		}
 
-		$name_length = strlen( $name );
-		$after_name  = $name_at + $name_length;
+		$after_name = $name_at + $name_length;
 
 		// If we have an un-ambiguous ampersand we can safely leave it in.
-		if ( ';' === $name[ $name_length - 1 ] ) {
+		if ( ';' === $text[ $name_at + $name_length - 1 ] ) {
 			$skip_bytes = $after_name - $at;
 			// @todo bring back the WP_Token_Map so we can decode these.
-			return html_entity_decode( "&{$name}" );
+			return $name;
 		}
 
 		/*
@@ -240,15 +248,15 @@ public static function read_character_reference( $text, $at, $allow_ambiguous_am
 		if ( ! $ambiguous_follower ) {
 			$skip_bytes = $after_name - $at;
 			// @todo Bring back WP_Token_Map to replace properly.
-			return html_entity_decode( "&{$name};" );
+			return $name;
 		}
 
 		if ( ! $allow_ambiguous_ampersand ) {
 			return null;
 		}
 
 		$skip_bytes = $after_name - $at;
-		return html_entity_decode( "&{$name};" );
+		return $name;
 	}
 
 	public static function code_point_to_utf8_bytes( $code_point ) {