Skip to content

Commit

Permalink
Implement "next_token()" & "get_token()" API
Browse files Browse the repository at this point in the history
  • Loading branch information
JanJakes committed Nov 14, 2024
1 parent 7155b7b commit 9426969
Show file tree
Hide file tree
Showing 2 changed files with 163 additions and 53 deletions.
111 changes: 87 additions & 24 deletions tests/mysql/WP_MySQL_Lexer_Tests.php
Original file line number Diff line number Diff line change
Expand Up @@ -3,22 +3,75 @@
use PHPUnit\Framework\TestCase;

class WP_MySQL_Lexer_Tests extends TestCase {
/**
 * A valid query is tokenized one token per next_token() call, ends with an
 * EOF token, and reports "no token" on every call after that.
 */
public function test_tokenize_valid_input(): void {
	$lexer = new WP_MySQL_Lexer( 'SELECT id FROM users' );

	// Token types the lexer must yield, in order, for the query above.
	$expected_types = array(
		WP_MySQL_Lexer::SELECT_SYMBOL, // SELECT
		WP_MySQL_Lexer::IDENTIFIER,    // id
		WP_MySQL_Lexer::FROM_SYMBOL,   // FROM
		WP_MySQL_Lexer::IDENTIFIER,    // users
		WP_MySQL_Lexer::EOF,           // EOF
	);
	foreach ( $expected_types as $expected_type ) {
		$this->assertTrue( $lexer->next_token() );
		$this->assertSame( $expected_type, $lexer->get_token()->get_type() );
	}

	// No more tokens; checked twice to make sure the state is stable.
	for ( $attempt = 0; $attempt < 2; $attempt += 1 ) {
		$this->assertFalse( $lexer->next_token() );
		$this->assertNull( $lexer->get_token() );
	}
}

/**
 * Invalid input makes next_token() return false and get_token() return
 * null, and the lexer keeps reporting that on all subsequent calls.
 */
public function test_tokenize_invalid_input(): void {
	$lexer = new WP_MySQL_Lexer( "SELECT x'ab01xyz'" );

	// SELECT
	$this->assertTrue( $lexer->next_token() );
	$this->assertSame( WP_MySQL_Lexer::SELECT_SYMBOL, $lexer->get_token()->get_type() );

	// The first pass hits the invalid input; the second and third passes
	// verify that no further tokens are produced afterwards.
	for ( $attempt = 0; $attempt < 3; $attempt += 1 ) {
		$this->assertFalse( $lexer->next_token() );
		$this->assertNull( $lexer->get_token() );
	}
}

/**
 * Test that the whole U+0080 to U+FFFF UTF-8 range is valid in an identifier.
 * The validity is checked against PCRE with the "u" (PCRE_UTF8) modifier set.
 */
public function test_identifier_utf8_range(): void {
	// The upper bound is inclusive so that U+FFFF itself is exercised too.
	for ( $i = 0x80; $i <= 0xffff; $i += 1 ) {
		$value = mb_chr( $i, 'UTF-8' );

		// next_token() reports success as a boolean; the token itself is
		// fetched separately through get_token().
		$lexer = new WP_MySQL_Lexer( $value );
		$this->assertTrue( $lexer->next_token() );

		$type     = $lexer->get_token()->get_type();
		$is_valid = preg_match( '/^[\x{0080}-\x{ffff}]$/u', $value );
		if ( $is_valid ) {
			$this->assertSame( WP_MySQL_Lexer::IDENTIFIER, $type );
		} else {
			// NOTE(review): presumably reached only when mb_chr() yields no
			// character for the code point (empty input) — confirm.
			$this->assertSame( WP_MySQL_Lexer::EOF, $type );
		}
	}
}
Expand All @@ -33,14 +86,19 @@ public function test_identifier_utf8_range(): void {
public function test_identifier_utf8_two_byte_sequences(): void {
for ( $byte_1 = 128; $byte_1 <= 255; $byte_1 += 1 ) {
for ( $byte_2 = 128; $byte_2 <= 255; $byte_2 += 1 ) {
$value = chr( $byte_1 ) . chr( $byte_2 );
$value = chr( $byte_1 ) . chr( $byte_2 );

$lexer = new WP_MySQL_Lexer( $value );
$result = $lexer->next_token();
$token = $lexer->get_token();

$is_valid = preg_match( '/^[\x{0080}-\x{ffff}]$/u', $value );
$lexer = new WP_MySQL_Lexer( $value );
$type = $lexer->next_token()->get_type();
if ( $is_valid ) {
$this->assertSame( WP_MySQL_Lexer::IDENTIFIER, $type );
$this->assertTrue( $result );
$this->assertSame( WP_MySQL_Lexer::IDENTIFIER, $token->get_type() );
} else {
$this->assertSame( WP_MySQL_Lexer::INVALID_INPUT, $type );
$this->assertFalse( $result );
$this->assertNull( $token );
}
}
}
Expand All @@ -58,14 +116,19 @@ public function test_identifier_utf8_three_byte_sequences(): void {
for ( $byte_1 = 0xE0; $byte_1 <= 0xFF; $byte_1 += 1 ) {
for ( $byte_2 = 128; $byte_2 <= 255; $byte_2 += 1 ) {
for ( $byte_3 = 128; $byte_3 <= 255; $byte_3 += 1 ) {
$value = chr( $byte_1 ) . chr( $byte_2 ) . chr( $byte_3 );
$value = chr( $byte_1 ) . chr( $byte_2 ) . chr( $byte_3 );

$lexer = new WP_MySQL_Lexer( $value );
$result = $lexer->next_token();
$token = $lexer->get_token();

$is_valid = preg_match( '/^[\x{0080}-\x{ffff}]$/u', $value );
$lexer = new WP_MySQL_Lexer( $value );
$type = $lexer->next_token()->get_type();
if ( $is_valid ) {
$this->assertSame( WP_MySQL_Lexer::IDENTIFIER, $type );
$this->assertTrue( $result );
$this->assertSame( WP_MySQL_Lexer::IDENTIFIER, $token->get_type() );
} else {
$this->assertSame( WP_MySQL_Lexer::INVALID_INPUT, $type );
$this->assertFalse( $result );
$this->assertNull( $token );
}
}
}
Expand All @@ -77,8 +140,8 @@ public function test_identifier_utf8_three_byte_sequences(): void {
*/
public function test_integer_types( $input, $expected ): void {
	$lexer = new WP_MySQL_Lexer( $input );

	// next_token() only signals success; the recognized token is read
	// through get_token().
	$this->assertTrue( $lexer->next_token() );
	$this->assertSame( $expected, $lexer->get_token()->get_type() );
}

public function data_integer_types(): array {
Expand Down Expand Up @@ -145,20 +208,20 @@ public function data_identifier_or_number(): array {
array( '0b01xyz', array( WP_MySQL_Lexer::IDENTIFIER, WP_MySQL_Lexer::EOF ) ), // identifier
array( '0b', array( WP_MySQL_Lexer::IDENTIFIER, WP_MySQL_Lexer::EOF ) ), // identifier
array( "b'01'", array( WP_MySQL_Lexer::BIN_NUMBER, WP_MySQL_Lexer::EOF ) ),
array( "b'01xyz'", array( WP_MySQL_Lexer::INVALID_INPUT, WP_MySQL_Lexer::IDENTIFIER, WP_MySQL_Lexer::INVALID_INPUT, WP_MySQL_Lexer::EOF ) ),
array( "b'01xyz'", array() ), // invalid input
array( "b''", array( WP_MySQL_Lexer::BIN_NUMBER, WP_MySQL_Lexer::EOF ) ),
array( "b'", array( WP_MySQL_Lexer::INVALID_INPUT, WP_MySQL_Lexer::EOF ) ),
array( "b'01", array( WP_MySQL_Lexer::INVALID_INPUT, WP_MySQL_Lexer::EOF ) ),
array( "b'", array() ), // invalid input
array( "b'01", array() ), // invalid input

// hex
array( '0xab01', array( WP_MySQL_Lexer::HEX_NUMBER, WP_MySQL_Lexer::EOF ) ),
array( '0xab01xyz', array( WP_MySQL_Lexer::IDENTIFIER, WP_MySQL_Lexer::EOF ) ), // identifier
array( '0x', array( WP_MySQL_Lexer::IDENTIFIER, WP_MySQL_Lexer::EOF ) ), // identifier
array( "x'ab01'", array( WP_MySQL_Lexer::HEX_NUMBER, WP_MySQL_Lexer::EOF ) ),
array( "x'ab01xyz'", array( WP_MySQL_Lexer::INVALID_INPUT, WP_MySQL_Lexer::IDENTIFIER, WP_MySQL_Lexer::INVALID_INPUT, WP_MySQL_Lexer::EOF ) ),
array( "x'ab01xyz'", array() ), // invalid input
array( "x''", array( WP_MySQL_Lexer::HEX_NUMBER, WP_MySQL_Lexer::EOF ) ),
array( "x'", array( WP_MySQL_Lexer::INVALID_INPUT, WP_MySQL_Lexer::EOF ) ),
array( "x'ab", array( WP_MySQL_Lexer::INVALID_INPUT, WP_MySQL_Lexer::EOF ) ),
array( "x'", array() ), // invalid input
array( "x'ab", array() ), // invalid input

// decimal
array( '123.456', array( WP_MySQL_Lexer::DECIMAL_NUMBER, WP_MySQL_Lexer::EOF ) ),
Expand Down
105 changes: 76 additions & 29 deletions wp-includes/mysql/class-wp-mysql-lexer.php
Original file line number Diff line number Diff line change
Expand Up @@ -936,9 +936,8 @@ class WP_MySQL_Lexer {
const MYSQL_COMMENT_END = 902;

// Special tokens
const WHITESPACE = 0;
const EOF = -1;
const INVALID_INPUT = -2;
const WHITESPACE = 0;
const EOF = -1;

/**
* A map of SQL keyword string values to their corresponding token types.
Expand Down Expand Up @@ -2151,6 +2150,17 @@ class WP_MySQL_Lexer {
*/
private $token_starts_at = 0;

/**
* The type of the current token.
*
* When a token is successfully recognized and read, this value is set to the
* constant representing the token type. It is null when no token has been
* read yet, when invalid input was encountered, or once next_token() has
* been called again after the EOF token was returned.
*
* @var int|null
*/
private $token_type;

/**
* Whether the tokenizer is inside an active MySQL-specific comment.
*
Expand Down Expand Up @@ -2184,22 +2194,56 @@ public function __construct(
*
* This method reads bytes from the SQL payload until a token is recognized.
* It starts from "$this->sql[ $this->bytes_already_read ]", advances the
* number of bytes read, and returns a boolean indicating whether a token
* was successfully recognized and read. The EOF token counts as a token;
* once it has been returned, or when an invalid token is reached, the
* method returns false.
*
* @return bool Whether a token was successfully recognized and read.
*/
public function next_token(): bool {
	// We already reached the end of the SQL payload or an invalid token.
	// Don't attempt to read any more bytes, and bail out immediately.
	if (
		self::EOF === $this->token_type
		|| ( null === $this->token_type && $this->bytes_already_read > 0 )
	) {
		// Clear the type so that get_token() returns null from now on.
		$this->token_type = null;
		return false;
	}

	// Keep reading until a token other than whitespace or a comment is
	// found; those token types are skipped and never exposed to the caller.
	do {
		$this->token_starts_at = $this->bytes_already_read;
		$this->token_type      = $this->read_next_token();
	} while (
		self::WHITESPACE === $this->token_type
		|| self::COMMENT === $this->token_type
		|| self::MYSQL_COMMENT_START === $this->token_type
		|| self::MYSQL_COMMENT_END === $this->token_type
	);

	// A null type marks invalid input; any other type is a readable token.
	return null !== $this->token_type;
}

/**
 * Get the current token as a WP_MySQL_Token object.
 *
 * Returns null when there is no current token, i.e., when the current
 * token type is null.
 *
 * @TODO: Consider referential stability ($lexer->get_token() === $lexer->get_token()),
 *        or separate getters for the token type and token bytes (no token objects).
 *
 * @return WP_MySQL_Token|null An object representing the current token, or null.
 */
public function get_token(): ?WP_MySQL_Token {
	// A new token object is created on every call.
	return null === $this->token_type
		? null
		: new WP_MySQL_Token( $this->token_type, $this->get_current_token_bytes() );
}

/**
Expand All @@ -2209,17 +2253,20 @@ public function next_token(): WP_MySQL_Token {
* by "$this->sql[ $this->bytes_already_read ]", and reads all tokens until
* the end of the SQL payload is reached, returning an array of token objects.
*
* When an invalid token is reached, the method stops and returns the partial
* sequence of valid tokens. In this case, the EOF token will not be included.
*
* This method can be used to tokenize the whole SQL payload at once, at the
* expense of storing all token objects in memory at the same time.
*
* @return WP_MySQL_Token[] An array of token objects representing the remaining tokens.
*/
public function remaining_tokens(): array {
	$tokens = array();

	// next_token() returns false after the EOF token and on invalid input,
	// so the loop ends in both cases; on invalid input the partial list is
	// returned without an EOF token.
	while ( true === $this->next_token() ) {
		$tokens[] = $this->get_token();
	}
	return $tokens;
}

Expand Down Expand Up @@ -2281,7 +2328,7 @@ public static function get_token_name( int $token_id ): ?string {
return $token_name ? $token_name : null;
}

private function read_next_token(): int {
private function read_next_token(): ?int {
$byte = $this->sql[ $this->bytes_already_read ] ?? null;
$next_byte = $this->sql[ $this->bytes_already_read + 1 ] ?? null;

Expand Down Expand Up @@ -2362,13 +2409,13 @@ private function read_next_token(): int {
if ( $this->mysql_version >= 50713 ) {
$type = self::JSON_UNQUOTED_SEPARATOR_SYMBOL;
} else {
$type = self::INVALID_INPUT;
return null; // Invalid input.
}
} else {
if ( $this->mysql_version >= 50708 ) {
$type = self::JSON_SEPARATOR_SYMBOL;
} else {
$type = self::INVALID_INPUT;
return null; // Invalid input.
}
}
} else {
Expand Down Expand Up @@ -2474,7 +2521,7 @@ private function read_next_token(): int {
$this->bytes_already_read += 1; // Consume the 'N'.
$type = self::NULL2_SYMBOL;
} else {
$type = self::INVALID_INPUT;
return null; // Invalid input.
}
} elseif ( '#' === $byte ) {
$type = $this->read_line_comment();
Expand Down Expand Up @@ -2531,7 +2578,7 @@ private function get_current_token_bytes(): string {
* See:
* https://dev.mysql.com/doc/refman/8.4/en/identifiers.html
*/
private function read_identifier(): int {
private function read_identifier(): ?int {
$started_at = $this->bytes_already_read;
while ( true ) {
// First, let's try to parse an ASCII sequence.
Expand Down Expand Up @@ -2590,10 +2637,10 @@ private function read_identifier(): int {

return $this->bytes_already_read - $started_at > 0
? self::IDENTIFIER
: self::INVALID_INPUT;
: null; // Invalid input.
}

private function read_number(): int {
private function read_number(): ?int {
// @TODO: Support numeric-only identifier parts after "." (e.g., 1ea10.1).

$byte = $this->sql[ $this->bytes_already_read ] ?? null;
Expand All @@ -2619,7 +2666,7 @@ private function read_number(): int {
$this->bytes_already_read >= strlen( $this->sql )
|| "'" !== $this->sql[ $this->bytes_already_read ]
) {
return self::INVALID_INPUT;
return null; // Invalid input.
}
$this->bytes_already_read += 1; // Consume the "'".
}
Expand All @@ -2642,7 +2689,7 @@ private function read_number(): int {
$this->bytes_already_read >= strlen( $this->sql )
|| "'" !== $this->sql[ $this->bytes_already_read ]
) {
return self::INVALID_INPUT;
return null; // Invalid input.
}
$this->bytes_already_read += 1; // Consume the "'".
}
Expand Down Expand Up @@ -2759,7 +2806,7 @@ private function read_number(): int {
*
* @param string $quote The quote character - ', ", or `.
*/
private function read_quoted_text(): int {
private function read_quoted_text(): ?int {
$quote = $this->sql[ $this->bytes_already_read ];
$this->bytes_already_read += 1; // Consume the quote.

Expand Down Expand Up @@ -2792,7 +2839,7 @@ private function read_quoted_text(): int {

// Unclosed string - unexpected EOF.
if ( ( $this->sql[ $at ] ?? null ) !== $quote ) {
return self::INVALID_INPUT;
return null; // Invalid input.
}

// Check if the quote is doubled.
Expand Down

0 comments on commit 9426969

Please sign in to comment.