Skip to content

Commit 17641f7

Browse files
committed
Check for the U+0080-U+FFFF range manually, add test coverage
1 parent dff4649 commit 17641f7

File tree

7 files changed

+103
-311
lines changed

7 files changed

+103
-311
lines changed

tests/bootstrap.php

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
<?php
22

33
require_once __DIR__ . '/wp-sqlite-schema.php';
4-
require_once __DIR__ . '/../wp-includes/utf8-decoder.php';
54
require_once __DIR__ . '/../wp-includes/mysql/class-wp-mysql-token.php';
65
require_once __DIR__ . '/../wp-includes/mysql/class-wp-mysql-lexer.php';
76
require_once __DIR__ . '/../wp-includes/parser/class-wp-parser-grammar.php';

tests/mysql/WP_MySQL_Lexer_Tests.php

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,75 @@
33
use PHPUnit\Framework\TestCase;
44

55
class WP_MySQL_Lexer_Tests extends TestCase {
6+
/**
 * Test every code point from U+0080 to U+FFFF (inclusive) as an identifier.
 *
 * Each code point is encoded as UTF-8 and lexed on its own. The expected
 * token type is derived from PCRE with the "u" (PCRE_UTF8) modifier set:
 * when the pattern matches, the lexer must produce an identifier; when the
 * code point cannot be encoded at all, the input is empty and the lexer
 * must produce EOF; otherwise the input must be reported as invalid.
 */
public function test_identifier_utf8_range(): void {
	// Use "<=" so that U+FFFF, the upper bound of the range, is covered too.
	for ( $i = 0x80; $i <= 0xffff; $i += 1 ) {
		// mb_chr() returns false on failure; lex an empty input in that case.
		// NOTE(review): presumably this happens for surrogates U+D800-U+DFFF — confirm.
		$char  = mb_chr( $i, 'UTF-8' );
		$value = false === $char ? '' : $char;

		$lexer    = new WP_MySQL_Lexer( $value );
		$type     = $lexer->next_token()->get_type();
		$is_valid = preg_match( '/^[\x{0080}-\x{ffff}]$/u', $value );

		// Name the code point in the failure message to ease debugging.
		$message = sprintf( 'Code point U+%04X', $i );
		if ( $is_valid ) {
			$this->assertSame( WP_MySQL_Lexer::IDENTIFIER, $type, $message );
		} elseif ( strlen( $value ) === 0 ) {
			$this->assertSame( WP_MySQL_Lexer::EOF, $type, $message );
		} else {
			$this->assertSame( WP_MySQL_Lexer::INVALID_INPUT, $type, $message );
		}
	}
}
25+
26+
/**
 * Exhaustively lex every 2-byte sequence whose bytes are both in 128-255.
 *
 * PCRE with the "u" (PCRE_UTF8) modifier set serves as the reference: a
 * sequence that matches the U+0080-U+FFFF character class must be lexed as
 * an identifier, any other sequence must be reported as invalid input.
 *
 * Both bytes cover the full 128-255 span on purpose, so that malformed
 * 2-byte combinations are exercised alongside the well-formed ones.
 */
public function test_identifier_utf8_two_byte_sequences(): void {
	$high_bytes = range( 128, 255 );

	foreach ( $high_bytes as $first ) {
		foreach ( $high_bytes as $second ) {
			$sequence = chr( $first ) . chr( $second );
			$matches  = preg_match( '/^[\x{0080}-\x{ffff}]$/u', $sequence );
			$token    = ( new WP_MySQL_Lexer( $sequence ) )->next_token();

			// preg_match() yields 0 or false for anything that is not valid.
			$expected = $matches
				? WP_MySQL_Lexer::IDENTIFIER
				: WP_MySQL_Lexer::INVALID_INPUT;

			$this->assertSame( $expected, $token->get_type() );
		}
	}
}
48+
49+
/**
 * Exhaustively lex 3-byte sequences that start with a byte in 0xE0-0xFF.
 *
 * PCRE with the "u" (PCRE_UTF8) modifier set serves as the reference: a
 * sequence that matches the U+0080-U+FFFF character class must be lexed as
 * an identifier, any other sequence must be reported as invalid input.
 *
 * The first byte begins at 0xE0, the lead byte that opens a 3-byte sequence.
 * The second and third bytes cover the full 128-255 span on purpose, so that
 * malformed 3-byte combinations are exercised alongside the well-formed ones.
 */
public function test_identifier_utf8_three_byte_sequences(): void {
	$lead_bytes = range( 0xE0, 0xFF );
	$high_bytes = range( 128, 255 );

	foreach ( $lead_bytes as $first ) {
		foreach ( $high_bytes as $second ) {
			foreach ( $high_bytes as $third ) {
				$sequence = chr( $first ) . chr( $second ) . chr( $third );
				$matches  = preg_match( '/^[\x{0080}-\x{ffff}]$/u', $sequence );
				$token    = ( new WP_MySQL_Lexer( $sequence ) )->next_token();

				// preg_match() yields 0 or false for anything that is not valid.
				$expected = $matches
					? WP_MySQL_Lexer::IDENTIFIER
					: WP_MySQL_Lexer::INVALID_INPUT;

				$this->assertSame( $expected, $token->get_type() );
			}
		}
	}
}
74+
675
/**
776
* Numbers vs. identifiers:
877
*

tests/tools/run-lexer-benchmark.php

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@ function ( $severity, $message, $file, $line ) {
1212
}
1313
);
1414

15-
require_once __DIR__ . '/../../wp-includes/utf8-decoder.php';
1615
require_once __DIR__ . '/../../wp-includes/mysql/class-wp-mysql-token.php';
1716
require_once __DIR__ . '/../../wp-includes/mysql/class-wp-mysql-lexer.php';
1817

tests/tools/run-parser-benchmark.php

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@ function ( $severity, $message, $file, $line ) {
1313
}
1414
);
1515

16-
require_once __DIR__ . '/../../wp-includes/utf8-decoder.php';
1716
require_once __DIR__ . '/../../wp-includes/mysql/class-wp-mysql-token.php';
1817
require_once __DIR__ . '/../../wp-includes/mysql/class-wp-mysql-lexer.php';
1918
require_once __DIR__ . '/../../wp-includes/parser/class-wp-parser-grammar.php';

tests/tools/run-parser-test.php

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@ function ( $severity, $message, $file, $line ) {
1212
}
1313
);
1414

15-
require_once __DIR__ . '/../../wp-includes/utf8-decoder.php';
1615
require_once __DIR__ . '/../../wp-includes/mysql/class-wp-mysql-token.php';
1716
require_once __DIR__ . '/../../wp-includes/mysql/class-wp-mysql-lexer.php';
1817
require_once __DIR__ . '/../../wp-includes/parser/class-wp-parser.php';

wp-includes/mysql/class-wp-mysql-lexer.php

Lines changed: 34 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2483,7 +2483,7 @@ private function get_current_token_bytes(): string {
24832483
* https://dev.mysql.com/doc/refman/8.4/en/identifiers.html
24842484
*
24852485
* Rules:
2486-
* 1. Allowed characters are ASCII a-z, A-Z, 0-9, _, $, and Unicode \x{0080}-\x{ffff}.
2486+
* 1. Allowed characters are ASCII a-z, A-Z, 0-9, _, $, and Unicode U+0080-U+FFFF.
24872487
* 2. Unquoted identifiers may begin with a digit but may not consist solely of digits.
24882488
*/
24892489
private function parse_identifier(): int {
@@ -2497,28 +2497,48 @@ private function parse_identifier(): int {
24972497
$this->bytes_already_read + $byte_length
24982498
);
24992499

2500-
// Check if the following byte can be part of a multibyte character.
2501-
// If not, bail out early to avoid unnecessary UTF-8 decoding.
2502-
$byte = $this->sql[ $this->bytes_already_read + $byte_length ] ?? null;
2503-
if ( null === $byte || ord( $byte ) < 128 ) {
2500+
// Check if the following byte can be part of a multibyte character
2501+
// in the range of U+0080 to U+FFFF before looking at further bytes.
2502+
// If it can't, bail out early to avoid unnecessary UTF-8 decoding.
2503+
// Identifiers are usually ASCII-only, so we can optimize for that.
2504+
$byte_1 = ord(
2505+
$this->sql[ $this->bytes_already_read + $byte_length ] ?? ''
2506+
);
2507+
if ( $byte_1 < 0xC2 || $byte_1 > 0xEF ) {
25042508
break;
25052509
}
25062510

2507-
// Check the \x{0080}-\x{ffff} Unicode character range.
2508-
$codepoint = utf8_codepoint_at(
2509-
$this->sql,
2510-
$this->bytes_already_read + $byte_length,
2511-
$bytes_parsed
2511+
// Look for a valid 2-byte UTF-8 symbol. Covers range U+0080 - U+07FF.
2512+
$byte_2 = ord(
2513+
$this->sql[ $this->bytes_already_read + $byte_length + 1 ] ?? ''
25122514
);
2515+
if (
2516+
$byte_1 <= 0xDF
2517+
&& $byte_2 >= 0x80 && $byte_2 <= 0xBF
2518+
) {
2519+
$byte_length += 2;
2520+
continue;
2521+
}
25132522

2523+
// Look for a valid 3-byte UTF-8 symbol in range U+0800 - U+FFFF.
2524+
$byte_3 = ord(
2525+
$this->sql[ $this->bytes_already_read + $byte_length + 2 ] ?? ''
2526+
);
25142527
if (
2515-
null === $codepoint
2516-
|| ! ( 0x80 <= $codepoint && 0xffff >= $codepoint )
2528+
$byte_1 <= 0xEF
2529+
&& $byte_2 >= 0x80 && $byte_2 <= 0xBF
2530+
&& $byte_3 >= 0x80 && $byte_3 <= 0xBF
2531+
// Exclude surrogate range U+D800 to U+DFFF:
2532+
&& ! ( 0xED === $byte_1 && $byte_2 >= 0xA0 )
2533+
// Exclude overlong encodings:
2534+
&& ! ( 0xE0 === $byte_1 && $byte_2 < 0xA0 )
25172535
) {
2518-
break;
2536+
$byte_length += 3;
2537+
continue;
25192538
}
25202539

2521-
$byte_length += $bytes_parsed;
2540+
// Not a valid identifier character.
2541+
break;
25222542
}
25232543

25242544
// An identifier cannot consist solely of digits.

0 commit comments

Comments
 (0)