Skip to content

Commit

Permalink
Update to WP_Token_Map
Browse files Browse the repository at this point in the history
  • Loading branch information
dmsnell committed Apr 13, 2024
1 parent 43be97a commit 18f481f
Show file tree
Hide file tree
Showing 5 changed files with 433 additions and 1,685 deletions.
299 changes: 299 additions & 0 deletions src/wp-includes/class-wp-token-map.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,299 @@
<?php

/**
 * Compact lookup table mapping byte-string tokens ("words") to replacement strings.
 *
 * Words are split into two storage areas based on the configured key length:
 *
 *  - "Small" words are no longer than the key length. They are packed into one
 *    string of fixed-width slots (NUL-padded on the right) and their mappings
 *    are kept in a parallel list indexed by slot number.
 *  - "Large" words are longer than the key length. They are grouped by their
 *    first `key_length` bytes; each group is a single packed string of
 *    `<suffix length byte><suffix><mapping length byte><mapping>` records,
 *    ordered longest-suffix first so that the longest token wins when reading.
 *
 * Because small words are NUL-padded, words containing NUL bytes cannot be
 * represented reliably in the small-word table.
 */
class WP_Token_Map {
	/**
	 * Exclusive upper bound on the byte length of a word or of a mapping.
	 *
	 * Lengths are stored in a single byte, so 255 is the largest storable length.
	 */
	const MAX_LENGTH = 256;

	/**
	 * Number of leading bytes used to group large words, and the slot width
	 * of the small-word table.
	 *
	 * @var int
	 */
	private $key_length = 2;

	/**
	 * Stores an optimized form of the word set, where words are grouped
	 * by their first `key_length` bytes and then collapsed into a string.
	 *
	 * @var array
	 */
	private $large_words = array();

	/**
	 * Stores an optimized row of short words, where every entry is `key_length`
	 * bytes long and NUL-extended if the word is shorter than that.
	 *
	 * @var string
	 */
	private $small_words = '';

	/**
	 * Holds mappings according to the slot index in the small words.
	 *
	 * @var string[]
	 */
	private $small_mappings = array();

	/**
	 * Builds a token map from an associative array of `word => mapping` pairs.
	 *
	 * @param array $mappings   Words as string keys, replacement strings as values.
	 * @param int   $key_length Optional. Number of bytes used as the group key. Default 2.
	 * @return WP_Token_Map|null The map, or null if any key is not a string or any
	 *                           word or mapping is MAX_LENGTH bytes or longer.
	 */
	public static function from_array( $mappings, $key_length = 2 ) {
		$map             = new WP_Token_Map();
		$map->key_length = $key_length;

		// Start by grouping words.

		$groups = array();
		$shorts = array();
		foreach ( $mappings as $word => $mapping ) {
			// Both lengths are packed into a single byte, so both must be bounded.
			if (
				! is_string( $word ) ||
				self::MAX_LENGTH <= strlen( $word ) ||
				self::MAX_LENGTH <= strlen( $mapping )
			) {
				return null;
			}

			if ( $key_length >= strlen( $word ) ) {
				// Keep the mapping next to its word so that sorting cannot separate them.
				$shorts[] = array( $word, $mapping );
				continue;
			}

			$group = substr( $word, 0, $key_length );

			if ( ! isset( $groups[ $group ] ) ) {
				$groups[ $group ] = array();
			}

			// Only the suffix is stored; the group key supplies the prefix.
			$groups[ $group ][] = array( substr( $word, $key_length ), $mapping );
		}

		// Sort the words by longest-first, then alphabetical.

		usort( $shorts, array( self::class, 'longest_first_then_alphabetical' ) );
		foreach ( $groups as $group_key => $group ) {
			usort( $groups[ $group_key ], array( self::class, 'longest_first_then_alphabetical' ) );
		}

		// Finally construct the optimized lookups.

		foreach ( $shorts as $short ) {
			list( $word, $mapping ) = $short;

			$map->small_words     .= str_pad( $word, $key_length, "\x00" );
			$map->small_mappings[] = $mapping;
		}

		foreach ( $groups as $group => $group_words ) {
			$group_string = '';

			foreach ( $group_words as $group_word ) {
				list( $word, $mapping ) = $group_word;

				$group_string .= pack( 'C', strlen( $word ) ) . $word . pack( 'C', strlen( $mapping ) ) . $mapping;
			}

			$map->large_words[ $group ] = $group_string;
		}

		return $map;
	}

	/**
	 * Creates a token map directly from previously-computed internal tables,
	 * such as those printed by `precomputed_php_source_table()`.
	 *
	 * No validation is performed on the provided tables.
	 *
	 * @param int      $key_length     Group key length the tables were built with.
	 * @param array    $large_words    Packed groups of large words, keyed by prefix.
	 * @param string   $small_words    Packed fixed-width row of small words.
	 * @param string[] $small_mappings Mappings for the small words, in slot order.
	 * @return WP_Token_Map
	 */
	public static function from_precomputed_table( $key_length, $large_words, $small_words, $small_mappings ) {
		$map = new WP_Token_Map();

		$map->key_length     = $key_length;
		$map->large_words    = $large_words;
		$map->small_words    = $small_words;
		$map->small_mappings = $small_mappings;

		return $map;
	}

	/**
	 * Looks up a word by exact match.
	 *
	 * Note that, despite the name, this returns the mapping rather than a
	 * boolean; compare the result against `false` with `===`, since a mapping
	 * may itself be a falsy string.
	 *
	 * @param string $word Word to look up.
	 * @return string|false Mapping for the word, or false if the word is not in the map.
	 */
	public function contains( $word ) {
		if ( $this->key_length >= strlen( $word ) ) {
			$mapping = $this->find_small_word( $word );

			return null === $mapping ? false : $mapping;
		}

		$group_key = substr( $word, 0, $this->key_length );
		if ( ! isset( $this->large_words[ $group_key ] ) ) {
			return false;
		}

		$group        = $this->large_words[ $group_key ];
		$group_length = strlen( $group );
		$slug         = substr( $word, $this->key_length );
		$slug_length  = strlen( $slug );
		$at           = 0;
		while ( $at < $group_length ) {
			$token_length   = ord( $group[ $at++ ] );
			$token_at       = $at;
			$at            += $token_length;
			$mapping_length = ord( $group[ $at++ ] );
			$mapping_at     = $at;
			$at            += $mapping_length;

			if ( $token_length === $slug_length && 0 === substr_compare( $group, $slug, $token_at, $token_length ) ) {
				return substr( $group, $mapping_at, $mapping_length );
			}
		}

		return false;
	}

	/**
	 * Reads the longest token found in the text starting at the given byte offset.
	 *
	 * Large words are tried first (longest suffix first), then small words from
	 * `key_length` bytes down to a single byte.
	 *
	 * @param string $text       Text in which to look for a token.
	 * @param int    $offset     Non-negative byte offset at which the token must start.
	 * @param int    $skip_bytes Set to the byte length of the matched token.
	 *                           Left untouched when no token matches.
	 * @return string|false Mapping of the matched token, or false if none matches.
	 */
	public function read_token( $text, $offset, &$skip_bytes ) {
		// Only the bytes at and after the offset are candidates for matching.
		$remaining = strlen( $text ) - $offset;
		if ( $remaining <= 0 ) {
			return false;
		}

		// Search for a long word first, if the text is long enough, and if that fails, a short one.
		if ( $this->key_length < $remaining ) {
			$group_key = substr( $text, $offset, $this->key_length );

			if ( isset( $this->large_words[ $group_key ] ) ) {
				$group        = $this->large_words[ $group_key ];
				$group_length = strlen( $group );
				$suffix_at    = $offset + $this->key_length;
				$at           = 0;
				while ( $at < $group_length ) {
					$token_length   = ord( $group[ $at++ ] );
					$token          = substr( $group, $at, $token_length );
					$at            += $token_length;
					$mapping_length = ord( $group[ $at++ ] );
					$mapping_at     = $at;
					$at            += $mapping_length;

					if ( 0 === substr_compare( $text, $token, $suffix_at, $token_length ) ) {
						$skip_bytes = $this->key_length + $token_length;
						return substr( $group, $mapping_at, $mapping_length );
					}
				}
			}
		}

		// Perhaps a short word then: prefer the longest one that matches.
		for ( $length = min( $this->key_length, $remaining ); $length > 0; $length-- ) {
			$candidate = substr( $text, $offset, $length );

			// NUL is the padding byte, so a candidate containing it would alias a shorter word.
			if ( false !== strpos( $candidate, "\x00" ) ) {
				continue;
			}

			$mapping = $this->find_small_word( $candidate );
			if ( null !== $mapping ) {
				$skip_bytes = $length;
				return $mapping;
			}
		}

		return false;
	}

	/**
	 * Exports every stored token as a list of `array( $word, $mapping )` pairs.
	 *
	 * Small words come first in slot order, followed by large words in group order.
	 *
	 * @return array[] List of two-element arrays: the word, then its mapping.
	 */
	public function to_array() {
		$tokens = array();

		$small_length = strlen( $this->small_words );
		$small_index  = 0;
		for ( $at = 0; $at < $small_length; $at += $this->key_length ) {
			$tokens[] = array(
				rtrim( substr( $this->small_words, $at, $this->key_length ), "\x00" ),
				$this->small_mappings[ $small_index++ ],
			);
		}

		foreach ( $this->large_words as $prefix => $group ) {
			$group_length = strlen( $group );
			$at           = 0;
			while ( $at < $group_length ) {
				$length  = ord( $group[ $at++ ] );
				$word    = $prefix . substr( $group, $at, $length );
				$at     += $length;
				$length  = ord( $group[ $at++ ] );
				$mapping = substr( $group, $at, $length );
				$at     += $length;

				$tokens[] = array( $word, $mapping );
			}
		}

		return $tokens;
	}

	/**
	 * Generates PHP source code which rebuilds this map through
	 * `from_precomputed_table()` without recomputing the lookup tables.
	 *
	 * All emitted string literals are escaped so that the generated source
	 * evaluates back to the exact stored bytes.
	 *
	 * @param string $indent Optional. Whitespace used for one level of indentation. Default tab.
	 * @return string PHP expression statement, terminated by `;` and a newline.
	 */
	public function precomputed_php_source_table( $indent = "\t" ) {
		$i1 = $indent;
		$i2 = $indent . $indent;

		$output  = self::class . "::from_precomputed_table(\n";
		$output .= "{$i1}{$this->key_length},\n";
		$output .= "{$i1}array(\n";

		$prefixes = array_keys( $this->large_words );
		sort( $prefixes );
		foreach ( $prefixes as $prefix ) {
			$group          = $this->large_words[ $prefix ];
			$group_length   = strlen( $group );
			$comment_prefix = self::escape_for_double_quotes( (string) $prefix );
			$comment_line   = "{$i2}//";
			// The prefix is emitted inside single quotes, where only `'` and `\` need escaping.
			$data_line = "{$i2}'" . addcslashes( (string) $prefix, "'\\" ) . "' => \"";
			$at        = 0;
			while ( $at < $group_length ) {
				$token_length   = ord( $group[ $at++ ] );
				$token          = self::escape_for_double_quotes( substr( $group, $at, $token_length ) );
				$at            += $token_length;
				$mapping_length = ord( $group[ $at++ ] );
				$mapping        = self::escape_for_double_quotes( substr( $group, $at, $mapping_length ) );
				$at            += $mapping_length;

				// Always two hex digits so that a following hex-like byte cannot extend the escape.
				$token_digits   = sprintf( '%02x', $token_length );
				$mapping_digits = sprintf( '%02x', $mapping_length );

				$comment_line .= " {$comment_prefix}{$token}[{$mapping}]";
				$data_line    .= "\\x{$token_digits}{$token}\\x{$mapping_digits}{$mapping}";
			}
			$comment_line .= "\n";
			$data_line    .= "\",\n";

			$output .= $comment_line;
			$output .= $data_line;
		}

		$output .= "{$i1}),\n";

		$small_text = self::escape_for_double_quotes( $this->small_words );
		$output    .= "{$i1}\"{$small_text}\",\n";

		$output .= "{$i1}array(\n";
		foreach ( $this->small_mappings as $mapping ) {
			$small_mapping = self::escape_for_double_quotes( $mapping );
			$output       .= "{$i2}\"{$small_mapping}\",\n";
		}
		$output .= "{$i1})\n";

		$output .= ");\n";

		return $output;
	}

	/**
	 * Finds the mapping for a short word by scanning the fixed-width small-word table.
	 *
	 * Only matches which start on a slot boundary count: a match straddling
	 * two neighboring slots is not a stored word.
	 *
	 * @param string $word Word no longer than the key length.
	 * @return string|null Mapping for the word, or null if it is not stored.
	 */
	private function find_small_word( $word ) {
		if ( $this->key_length < 1 || strlen( $word ) > $this->key_length ) {
			return null;
		}

		$needle = str_pad( $word, $this->key_length, "\x00" );
		$at     = strpos( $this->small_words, $needle );
		while ( false !== $at ) {
			if ( 0 === $at % $this->key_length ) {
				$slot = $at / $this->key_length;

				return isset( $this->small_mappings[ $slot ] ) ? $this->small_mappings[ $slot ] : null;
			}

			$at = strpos( $this->small_words, $needle, $at + 1 );
		}

		return null;
	}

	/**
	 * Escapes raw bytes for inclusion inside a double-quoted PHP string literal.
	 *
	 * Control bytes become two-digit `\xHH` escapes; the double quote, backslash
	 * and dollar sign are backslash-escaped so that nothing terminates the
	 * string, forms an unintended escape sequence, or triggers interpolation.
	 *
	 * @param string $bytes Raw bytes to escape.
	 * @return string Escaped text, safe between double quotes in PHP source.
	 */
	private static function escape_for_double_quotes( $bytes ) {
		return preg_replace_callback(
			'~[\x00-\x1f"$\\\\]~',
			static function ( $match ) {
				$byte = $match[0];

				if ( '"' === $byte || '$' === $byte || '\\' === $byte ) {
					return '\\' . $byte;
				}

				return sprintf( '\\x%02x', ord( $byte ) );
			},
			$bytes
		);
	}

	/**
	 * Comparison callback ordering `array( $word, $mapping )` entries by word:
	 * longer words first, then byte-wise alphabetical among equal lengths.
	 *
	 * @param array $a First entry; the word is at index 0.
	 * @param array $b Second entry; the word is at index 0.
	 * @return int Negative if $a sorts first, positive if $b sorts first, 0 if the words are equal.
	 */
	private static function longest_first_then_alphabetical( $a, $b ) {
		if ( $a[0] === $b[0] ) {
			return 0;
		}

		$la = strlen( $a[0] );
		$lb = strlen( $b[0] );

		// Longer strings are less-than for comparison's sake.
		if ( $la !== $lb ) {
			return $lb - $la;
		}

		return strcmp( $a[0], $b[0] );
	}
}
38 changes: 23 additions & 15 deletions src/wp-includes/html-api/class-wp-html-decoder.php
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,9 @@ public static function read_character_reference( $text, $at, $allow_ambiguous_am
}

// Cannot encode invalid Unicode code points. Max is to U+10FFFF.
$digit_count = strspn( $text, $numeric_digits, $digits_at );
$after_digits = $digits_at + $digit_count;
$zero_count = strspn( $text, '0', $digits_at );
$digit_count = strspn( $text, $numeric_digits, $digits_at + $zero_count );
$after_digits = $digits_at + $zero_count + $digit_count;
$has_semicolon = $after_digits < $length && ';' === $text[ $after_digits ];
$end_of_span = $has_semicolon ? $after_digits + 1 : $after_digits;

Expand All @@ -111,12 +112,12 @@ public static function read_character_reference( $text, $at, $allow_ambiguous_am
return null;
}

if ( $digit_count > $max_digits ) {
if ( $digit_count - $zero_count > $max_digits ) {
$skip_bytes = $end_of_span - $at;
return '';
}

$digits = substr( $text, $digits_at, $digit_count );
$digits = substr( $text, $digits_at + $zero_count, $digit_count );
$code_point = intval( $digits, $numeric_base );

if (
Expand All @@ -129,7 +130,18 @@ public static function read_character_reference( $text, $at, $allow_ambiguous_am
// Surrogate.
( $code_point >= 0xD800 && $code_point <= 0xDFFF ) ||

// Noncharacters.
/*
* Noncharacters.
*
* > A noncharacter is a code point that is in the range U+FDD0 to U+FDEF,
* > inclusive, or U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF,
* > U+3FFFE, U+3FFFF, U+4FFFE, U+4FFFF, U+5FFFE, U+5FFFF, U+6FFFE,
* > U+6FFFF, U+7FFFE, U+7FFFF, U+8FFFE, U+8FFFF, U+9FFFE, U+9FFFF,
* > U+AFFFE, U+AFFFF, U+BFFFE, U+BFFFF, U+CFFFE, U+CFFFF, U+DFFFE,
* > U+DFFFF, U+EFFFE, U+EFFFF, U+FFFFE, U+FFFFF, U+10FFFE, or U+10FFFF.
*
* @see https://infra.spec.whatwg.org/#noncharacter
*/
( $code_point >= 0xFDD0 && $code_point <= 0xFDEF ) ||
( 0xFFFE === ( $code_point & 0xFFFE ) ) ||

Expand Down Expand Up @@ -204,22 +216,18 @@ public static function read_character_reference( $text, $at, $allow_ambiguous_am
return null;
}

// Advance past the `&`.
++$name_at;

$name = $html5_named_character_entity_set->read_token( $text, $name_at );
$name = $html5_named_character_entity_set->read_token( $text, $name_at, $name_length );
if ( false === $name ) {
return null;
}

$name_length = strlen( $name );
$after_name = $name_at + $name_length;
$after_name = $name_at + $name_length;

// If we have an un-ambiguous ampersand we can safely leave it in.
if ( ';' === $name[ $name_length - 1 ] ) {
if ( ';' === $text[ $name_at + $name_length - 1 ] ) {
$skip_bytes = $after_name - $at;
// @todo bring back the WP_Token_Map so we can decode these.
return html_entity_decode( "&{$name}" );
return $name;
}

/*
Expand All @@ -240,15 +248,15 @@ public static function read_character_reference( $text, $at, $allow_ambiguous_am
if ( ! $ambiguous_follower ) {
$skip_bytes = $after_name - $at;
// @todo Bring back WP_Token_Map to replace properly.
return html_entity_decode( "&{$name};" );
return $name;
}

if ( ! $allow_ambiguous_ampersand ) {
return null;
}

$skip_bytes = $after_name - $at;
return html_entity_decode( "&{$name};" );
return $name;
}

public static function code_point_to_utf8_bytes( $code_point ) {
Expand Down
Loading

0 comments on commit 18f481f

Please sign in to comment.