Replace preg_match with strspn and UTF-8 decoder when handling identifiers
WordPress · Nov 7, 2024 · 04bf6e5 · 04bf6e5
1 parent 7d5869b
commit 04bf6e5
Show file tree

Hide file tree

Showing 6 changed files with 378 additions and 32 deletions.
diff --git a/tests/bootstrap.php b/tests/bootstrap.php
@@ -1,6 +1,7 @@
 <?php
 
 require_once __DIR__ . '/wp-sqlite-schema.php';
+require_once __DIR__ . '/../wp-includes/utf8-decoder.php';
 require_once __DIR__ . '/../wp-includes/mysql/class-wp-mysql-token.php';
 require_once __DIR__ . '/../wp-includes/mysql/class-wp-mysql-lexer.php';
 require_once __DIR__ . '/../wp-includes/parser/class-wp-parser-grammar.php';

diff --git a/tests/tools/run-lexer-benchmark.php b/tests/tools/run-lexer-benchmark.php
@@ -12,6 +12,7 @@ function ( $severity, $message, $file, $line ) {
 	}
 );
 
+require_once __DIR__ . '/../../wp-includes/utf8-decoder.php';
 require_once __DIR__ . '/../../wp-includes/mysql/class-wp-mysql-token.php';
 require_once __DIR__ . '/../../wp-includes/mysql/class-wp-mysql-lexer.php';
 

diff --git a/tests/tools/run-parser-benchmark.php b/tests/tools/run-parser-benchmark.php
@@ -13,6 +13,7 @@ function ( $severity, $message, $file, $line ) {
 	}
 );
 
+require_once __DIR__ . '/../../wp-includes/utf8-decoder.php';
 require_once __DIR__ . '/../../wp-includes/mysql/class-wp-mysql-token.php';
 require_once __DIR__ . '/../../wp-includes/mysql/class-wp-mysql-lexer.php';
 require_once __DIR__ . '/../../wp-includes/parser/class-wp-parser-grammar.php';

diff --git a/tests/tools/run-parser-test.php b/tests/tools/run-parser-test.php
@@ -12,6 +12,7 @@ function ( $severity, $message, $file, $line ) {
 	}
 );
 
+require_once __DIR__ . '/../../wp-includes/utf8-decoder.php';
 require_once __DIR__ . '/../../wp-includes/mysql/class-wp-mysql-token.php';
 require_once __DIR__ . '/../../wp-includes/mysql/class-wp-mysql-lexer.php';
 require_once __DIR__ . '/../../wp-includes/parser/class-wp-parser.php';

diff --git a/wp-includes/mysql/class-wp-mysql-lexer.php b/wp-includes/mysql/class-wp-mysql-lexer.php
@@ -29,16 +29,6 @@ class WP_MySQL_Lexer {
 		"\f" => true,
 	);
 
-	/**
-	 * Unquoted identifiers:
-	 *   https://dev.mysql.com/doc/refman/8.4/en/identifiers.html
-	 *
-	 * Rules:
-	 *   1. Allowed characters are ASCII a-z, A-Z, 0-9, $, _ and Unicode \x{0080}-\x{ffff}.
-	 *   2. Unquoted identifiers may begin with a digit but may not consist solely of digits.
-	 */
-	const PATTERN_UNQUOTED_IDENTIFIER = '[a-zA-Z0-9_$\x{80}-\x{ffff}]*[a-zA-Z_$\x{80}-\x{ffff}][a-zA-Z0-9_$\x{80}-\x{ffff}]*';
-
 	/**
 	 * Tokens from the MySQL Workbench "predefined.tokens" list, including token numbers.
 	 * See:
@@ -2365,28 +2355,32 @@ private function next_token() {
 				$this->text = $prefix . $this->text;
 				$this->type = self::NCHAR_TEXT;
 			}
-		} elseif ( preg_match( '/\G' . self::PATTERN_UNQUOTED_IDENTIFIER . '/u', $this->input, $matches, 0, $this->position ) ) {
-			$p               = $this->position - 1;
-			$this->text      = $matches[0];
-			$this->position += strlen( $this->text );
-			$this->c         = $this->input[ $this->position ] ?? null;
-			$this->n         = $this->input[ $this->position + 1 ] ?? null;
-
-			// When preceded by a dot, it is always an identifier.
-			if ( $p >= 0 && '.' === $this->input[ $p ] ) {
-				$this->type = self::IDENTIFIER;
-			} elseif ( '_' === $la && isset( self::UNDERSCORE_CHARSETS[ strtolower( $this->text ) ] ) ) {
-				$this->type = self::UNDERSCORE_CHARSET;
-			} else {
-				$this->identifier_or_keyword();
-			}
 		} elseif ( null === $la ) {
 			$this->match_eof();
 			$this->token_instance = new WP_MySQL_Token( self::EOF, '<EOF>' );
 			return false;
 		} else {
-			$this->consume();
-			$this->type = self::INVALID_INPUT;
+			$previous_position = $this->position - 1;
+			$bytes_parsed      = $this->parse_identifier();
+
+			if ( $bytes_parsed > 0 ) {
+				$this->text      = substr( $this->input, $this->position, $bytes_parsed );
+				$this->position += $bytes_parsed;
+				$this->c         = $this->input[ $this->position ] ?? null;
+				$this->n         = $this->input[ $this->position + 1 ] ?? null;
+
+				// When preceded by a dot, it is always an identifier.
+				if ( $previous_position >= 0 && '.' === $this->input[ $previous_position ] ) {
+					$this->type = self::IDENTIFIER;
+				} elseif ( '_' === $la && isset( self::UNDERSCORE_CHARSETS[ strtolower( $this->text ) ] ) ) {
+					$this->type = self::UNDERSCORE_CHARSET;
+				} else {
+					$this->identifier_or_keyword();
+				}
+			} else {
+				$this->consume();
+				$this->type = self::INVALID_INPUT;
+			}
 		}
 
 		$this->token_instance = null === $this->type ? null : new WP_MySQL_Token( $this->type, $this->text, $this->channel );
@@ -2412,6 +2406,58 @@ protected function match_eof() {
 		}
 	}
 
+	/**
+	 * Measure the unquoted identifier that starts at the current position.
+	 *   https://dev.mysql.com/doc/refman/8.4/en/identifiers.html
+	 *
+	 * Rules:
+	 *   1. Allowed characters are ASCII a-z, A-Z, 0-9, _, $, and Unicode \x{0080}-\x{ffff}.
+	 *   2. Unquoted identifiers may begin with a digit but may not consist solely of digits.
+	 *
+	 * Reads $this->input from $this->position without modifying either.
+	 *
+	 * @return int Identifier length in bytes, or 0 when no identifier starts here.
+	 */
+	private function parse_identifier(): int {
+		$byte_length = 0;
+
+		while ( true ) {
+			// Fast path: consume the longest run of allowed ASCII characters.
+			$byte_length += strspn(
+				$this->input,
+				'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_$',
+				$this->position + $byte_length
+			);
+
+			// A missing or ASCII byte here cannot start a multibyte character,
+			// so stop without paying for UTF-8 decoding.
+			$byte = $this->input[ $this->position + $byte_length ] ?? null;
+			if ( null === $byte || ord( $byte ) < 128 ) {
+				break;
+			}
+
+			// Decode one code point; the decoder is expected to set its byte width by reference.
+			$bytes_parsed = 0;
+			$codepoint    = utf8_codepoint_at( $this->input, $this->position + $byte_length, $bytes_parsed );
+
+			// Stop on undecodable input, on code points outside \x{0080}-\x{ffff},
+			// and when no bytes were consumed (otherwise this loop would never end).
+			if ( null === $codepoint || $codepoint < 0x80 || $codepoint > 0xffff || $bytes_parsed <= 0 ) {
+				break;
+			}
+
+			$byte_length += $bytes_parsed;
+		}
+
+		// An identifier cannot consist solely of digits.
+		if (
+			$byte_length === strspn( $this->input, '0123456789', $this->position, $byte_length )
+		) {
+			return 0;
+		}
+		return $byte_length;
+	}
+
 	protected function identifier_or_keyword() {
 		$text = strtoupper( $this->get_text() );
 
@@ -2523,13 +2569,16 @@ protected function number() {
 			self::INT_NUMBER === $this->type
 			|| ( '0' === $this->text[0] && ( 'b' === $this->text[1] || 'x' === $this->text[1] ) );
 
-		if ( $possible_identifier_prefix && preg_match( '/\G' . self::PATTERN_UNQUOTED_IDENTIFIER . '/u', $this->input, $matches, 0, $start_position ) ) {
-			$end_position = $start_position + strlen( $matches[0] );
+		if ( $possible_identifier_prefix ) {
+			$position       = $this->position;
+			$this->position = $start_position;
+			$bytes_parsed   = $this->parse_identifier();
+			$this->position = $position;
 
 			// When matched more than the number, it's an identifier.
-			if ( $end_position > $this->position ) {
-				$this->text     = $matches[0];
-				$this->position = $end_position;
+			if ( $start_position + $bytes_parsed > $this->position ) {
+				$this->text     = substr( $this->input, $start_position, $bytes_parsed );
+				$this->position = $start_position + $bytes_parsed;
 				$this->c        = $this->input[ $this->position ] ?? null;
 				$this->n        = $this->input[ $this->position + 1 ] ?? null;
 				$this->type     = self::IDENTIFIER;