Check for the U+0080-U+FFFF range manually, add test coverage

JanJakes · JanJakes · commit 17641f7d1a59 · 2024-11-11T16:07:14.000+01:00
diff --git a/tests/bootstrap.php b/tests/bootstrap.php
@@ -1,7 +1,6 @@
 <?php
 
 require_once __DIR__ . '/wp-sqlite-schema.php';
-require_once __DIR__ . '/../wp-includes/utf8-decoder.php';
 require_once __DIR__ . '/../wp-includes/mysql/class-wp-mysql-token.php';
 require_once __DIR__ . '/../wp-includes/mysql/class-wp-mysql-lexer.php';
 require_once __DIR__ . '/../wp-includes/parser/class-wp-parser-grammar.php';
diff --git a/tests/mysql/WP_MySQL_Lexer_Tests.php b/tests/mysql/WP_MySQL_Lexer_Tests.php
@@ -3,6 +3,75 @@
 use PHPUnit\Framework\TestCase;
 
 class WP_MySQL_Lexer_Tests extends TestCase {
+	/**
+	 * Test every code point from U+0080 to U+FFFF (both inclusive) as an identifier.
+	 * The validity is checked against PCRE with the "u" (PCRE_UTF8) modifier set.
+	 */
+	public function test_identifier_utf8_range(): void {
+		for ( $i = 0x80; $i <= 0xffff; $i += 1 ) {
+			$value    = mb_chr( $i, 'UTF-8' );
+			$lexer    = new WP_MySQL_Lexer( $value );
+			$type     = $lexer->next_token()->get_type();
+			$is_valid = preg_match( '/^[\x{0080}-\x{ffff}]$/u', $value );
+			if ( $is_valid ) {
+				$expected = WP_MySQL_Lexer::IDENTIFIER;
+			} else {
+				// No character from mb_chr() (presumably surrogates) means empty input.
+				$expected = strlen( $value ) === 0 ? WP_MySQL_Lexer::EOF : WP_MySQL_Lexer::INVALID_INPUT;
+			}
+			$this->assertSame( $expected, $type );
+		}
+	}
+
+	/**
+	 * Test every two-byte combination of the bytes 0x80-0xFF as an identifier.
+	 * PCRE with the "u" (PCRE_UTF8) modifier set decides which pairs are valid.
+	 *
+	 * Both bytes cover the full 128-255 range on purpose, so that pairs that
+	 * do not match the PCRE character class are exercised as well; the lexer
+	 * must report those as invalid input rather than as identifiers.
+	 */
+	public function test_identifier_utf8_two_byte_sequences(): void {
+		$pattern = '/^[\x{0080}-\x{ffff}]$/u';
+		foreach ( range( 128, 255 ) as $first ) {
+			foreach ( range( 128, 255 ) as $second ) {
+				$bytes    = chr( $first ) . chr( $second );
+				$expected = preg_match( $pattern, $bytes )
+					? WP_MySQL_Lexer::IDENTIFIER
+					: WP_MySQL_Lexer::INVALID_INPUT;
+				$token    = ( new WP_MySQL_Lexer( $bytes ) )->next_token();
+
+				$this->assertSame( $expected, $token->get_type() );
+			}
+		}
+	}
+
+	/**
+	 * Test three-byte combinations as identifiers, valid and invalid alike.
+	 * PCRE with the "u" (PCRE_UTF8) modifier set decides which ones are valid.
+	 *
+	 * The first byte runs over 0xE0-0xFF, while the second and the third byte
+	 * run over 128-255, so combinations that do not match the PCRE character
+	 * class are included, and they must be reported as invalid input.
+	 */
+	public function test_identifier_utf8_three_byte_sequences(): void {
+		$pattern = '/^[\x{0080}-\x{ffff}]$/u';
+		foreach ( range( 0xE0, 0xFF ) as $first ) {
+			foreach ( range( 128, 255 ) as $second ) {
+				$prefix = chr( $first ) . chr( $second );
+				foreach ( range( 128, 255 ) as $third ) {
+					$bytes    = $prefix . chr( $third );
+					$expected = preg_match( $pattern, $bytes )
+						? WP_MySQL_Lexer::IDENTIFIER
+						: WP_MySQL_Lexer::INVALID_INPUT;
+					$token    = ( new WP_MySQL_Lexer( $bytes ) )->next_token();
+
+					$this->assertSame( $expected, $token->get_type() );
+				}
+			}
+		}
+	}
+
 	/**
 	 * Numbers vs. identifiers:
 	 *
diff --git a/tests/tools/run-lexer-benchmark.php b/tests/tools/run-lexer-benchmark.php
@@ -12,7 +12,6 @@ function ( $severity, $message, $file, $line ) {
 	}
 );
 
-require_once __DIR__ . '/../../wp-includes/utf8-decoder.php';
 require_once __DIR__ . '/../../wp-includes/mysql/class-wp-mysql-token.php';
 require_once __DIR__ . '/../../wp-includes/mysql/class-wp-mysql-lexer.php';
 
diff --git a/tests/tools/run-parser-benchmark.php b/tests/tools/run-parser-benchmark.php
@@ -13,7 +13,6 @@ function ( $severity, $message, $file, $line ) {
 	}
 );
 
-require_once __DIR__ . '/../../wp-includes/utf8-decoder.php';
 require_once __DIR__ . '/../../wp-includes/mysql/class-wp-mysql-token.php';
 require_once __DIR__ . '/../../wp-includes/mysql/class-wp-mysql-lexer.php';
 require_once __DIR__ . '/../../wp-includes/parser/class-wp-parser-grammar.php';
diff --git a/tests/tools/run-parser-test.php b/tests/tools/run-parser-test.php
@@ -12,7 +12,6 @@ function ( $severity, $message, $file, $line ) {
 	}
 );
 
-require_once __DIR__ . '/../../wp-includes/utf8-decoder.php';
 require_once __DIR__ . '/../../wp-includes/mysql/class-wp-mysql-token.php';
 require_once __DIR__ . '/../../wp-includes/mysql/class-wp-mysql-lexer.php';
 require_once __DIR__ . '/../../wp-includes/parser/class-wp-parser.php';
diff --git a/wp-includes/mysql/class-wp-mysql-lexer.php b/wp-includes/mysql/class-wp-mysql-lexer.php
@@ -2483,7 +2483,7 @@ private function get_current_token_bytes(): string {
 	 *   https://dev.mysql.com/doc/refman/8.4/en/identifiers.html
 	 *
 	 * Rules:
-	 *   1. Allowed characters are ASCII a-z, A-Z, 0-9, _, $, and Unicode \x{0080}-\x{ffff}.
+	 *   1. Allowed characters are ASCII a-z, A-Z, 0-9, _, $, and Unicode U+0080-U+FFFF.
 	 *   2. Unquoted identifiers may begin with a digit but may not consist solely of digits.
 	 */
 	private function parse_identifier(): int {
@@ -2497,28 +2497,48 @@ private function parse_identifier(): int {
 				$this->bytes_already_read + $byte_length
 			);
 
-			// Check if the following byte can be part of a multibyte character.
-			// If not, bail out early to avoid unnecessary UTF-8 decoding.
-			$byte = $this->sql[ $this->bytes_already_read + $byte_length ] ?? null;
-			if ( null === $byte || ord( $byte ) < 128 ) {
+			// Check if the following byte can be part of a multibyte character
+			// in the range of U+0080 to U+FFFF before looking at further bytes.
+			// If it can't, bail out early to avoid unnecessary UTF-8 decoding.
+			// Identifiers are usually ASCII-only, so we can optimize for that.
+			$byte_1 = ord(
+				$this->sql[ $this->bytes_already_read + $byte_length ] ?? ''
+			);
+			if ( $byte_1 < 0xC2 || $byte_1 > 0xEF ) {
 				break;
 			}
 
-			// Check the \x{0080}-\x{ffff} Unicode character range.
-			$codepoint = utf8_codepoint_at(
-				$this->sql,
-				$this->bytes_already_read + $byte_length,
-				$bytes_parsed
+			// Look for a valid 2-byte UTF-8 symbol. Covers range U+0080 - U+07FF.
+			$byte_2 = ord(
+				$this->sql[ $this->bytes_already_read + $byte_length + 1 ] ?? ''
 			);
+			if (
+				$byte_1 <= 0xDF
+				&& $byte_2 >= 0x80 && $byte_2 <= 0xBF
+			) {
+				$byte_length += 2;
+				continue;
+			}
 
+			// Look for a valid 3-byte UTF-8 symbol in range U+0800 - U+FFFF.
+			$byte_3 = ord(
+				$this->sql[ $this->bytes_already_read + $byte_length + 2 ] ?? ''
+			);
 			if (
-				null === $codepoint
-				|| ! ( 0x80 <= $codepoint && 0xffff >= $codepoint )
+				$byte_1 <= 0xEF
+				&& $byte_2 >= 0x80 && $byte_2 <= 0xBF
+				&& $byte_3 >= 0x80 && $byte_3 <= 0xBF
+				// Exclude surrogate range U+D800 to U+DFFF:
+				&& ! ( 0xED === $byte_1 && $byte_2 >= 0xA0 )
+				// Exclude overlong encodings:
+				&& ! ( 0xE0 === $byte_1 && $byte_2 < 0xA0 )
 			) {
-				break;
+				$byte_length += 3;
+				continue;
 			}
 
-			$byte_length += $bytes_parsed;
+			// Not a valid identifier character.
+			break;
 		}
 
 		// An identifier cannot consist solely of digits.
diff --git a/wp-includes/utf8-decoder.php b/wp-includes/utf8-decoder.php

Original file line number	Diff line number	Diff line change
`@@ -12,7 +12,6 @@ function ( $severity, $message, $file, $line ) {`
`12`	`12`	`}`
`13`	`13`	`);`
`14`	`14`
`15`		`-require_once __DIR__ . '/../../wp-includes/utf8-decoder.php';`
`16`	`15`	`require_once __DIR__ . '/../../wp-includes/mysql/class-wp-mysql-token.php';`
`17`	`16`	`require_once __DIR__ . '/../../wp-includes/mysql/class-wp-mysql-lexer.php';`
`18`	`17`
Original file line number	Diff line number	Diff line change
`@@ -13,7 +13,6 @@ function ( $severity, $message, $file, $line ) {`
`13`	`13`	`}`
`14`	`14`	`);`
`15`	`15`
`16`		`-require_once __DIR__ . '/../../wp-includes/utf8-decoder.php';`
`17`	`16`	`require_once __DIR__ . '/../../wp-includes/mysql/class-wp-mysql-token.php';`
`18`	`17`	`require_once __DIR__ . '/../../wp-includes/mysql/class-wp-mysql-lexer.php';`
`19`	`18`	`require_once __DIR__ . '/../../wp-includes/parser/class-wp-parser-grammar.php';`