|
3 | 3 | use PHPUnit\Framework\TestCase; |
4 | 4 |
|
5 | 5 | class WP_MySQL_Lexer_Tests extends TestCase { |
| 6 | + /** |
| 7 | + * Test that the whole U+0080 to U+FFFF UTF-8 range is valid in an identifier. |
| 8 | + * The validity is checked against PCRE with the "u" (PCRE_UTF8) modifier set. |
| 9 | + */ |
public function test_identifier_utf8_range(): void {
	// Walk every code point from U+0080 through U+FFFF, inclusive of both ends.
	for ( $i = 0x80; $i <= 0xffff; $i += 1 ) {
		// mb_chr() returns false when it cannot encode a code point
		// (presumably the surrogate range U+D800 to U+DFFF - TODO confirm).
		// Normalize that to an empty string so the lexer always gets a string.
		$char  = mb_chr( $i, 'UTF-8' );
		$value = false === $char ? '' : $char;

		$lexer = new WP_MySQL_Lexer( $value );
		$type  = $lexer->next_token()->get_type();

		// PCRE in UTF-8 mode is the reference: only a result of exactly 1 means
		// the value is a single character in range. Both 0 (no match) and
		// false (PCRE error) count as "not a valid identifier character".
		$is_valid = 1 === preg_match( '/^[\x{0080}-\x{ffff}]$/u', $value );

		// Name the code point in failures; the loop runs tens of thousands of times.
		$message = sprintf( 'Unexpected token type for code point U+%04X.', $i );

		if ( $is_valid ) {
			$this->assertSame( WP_MySQL_Lexer::IDENTIFIER, $type, $message );
		} elseif ( '' === $value ) {
			// Nothing could be encoded, so the lexer was given empty input.
			$this->assertSame( WP_MySQL_Lexer::EOF, $type, $message );
		} else {
			$this->assertSame( WP_MySQL_Lexer::INVALID_INPUT, $type, $message );
		}
	}
}
| 25 | + |
| 26 | + /** |
| 27 | + * Test all valid and invalid 2-byte UTF-8 sequences in an identifier. |
| 28 | + * The validity is checked against PCRE with the "u" (PCRE_UTF8) modifier set. |
| 29 | + * |
| 30 | + * Start both bytes from 128 and go up to 255 to include all invalid 2-byte |
| 31 | + * UTF-8 sequences as well, and ensure that they won't match as identifiers. |
| 32 | + */ |
public function test_identifier_utf8_two_byte_sequences(): void {
	// Pair every non-ASCII lead byte (128-255) with every non-ASCII second byte.
	foreach ( range( 128, 255 ) as $lead ) {
		foreach ( range( 128, 255 ) as $trail ) {
			$sequence = chr( $lead ) . chr( $trail );

			// PCRE in UTF-8 mode decides whether the pair is one valid character
			// in range; a falsy result (0 or false) means it is not.
			$matched = preg_match( '/^[\x{0080}-\x{ffff}]$/u', $sequence );

			$tokenizer  = new WP_MySQL_Lexer( $sequence );
			$token_type = $tokenizer->next_token()->get_type();

			// Valid pairs must lex as an identifier, everything else as invalid input.
			$expected = $matched ? WP_MySQL_Lexer::IDENTIFIER : WP_MySQL_Lexer::INVALID_INPUT;
			$this->assertSame( $expected, $token_type );
		}
	}
}
| 48 | + |
| 49 | + /** |
| 50 | + * Test all valid and invalid 3-byte UTF-8 sequences in an identifier. |
| 51 | + * The validity is checked against PCRE with the "u" (PCRE_UTF8) modifier set. |
| 52 | + * |
| 53 | + * Start the first byte from 0xE0 to mark the beginning of a 3-byte sequence. |
| 54 | + * Start bytes 2 and 3 from 128 and go up to 255 to include all invalid 3-byte |
| 55 | + * UTF-8 sequences as well, and ensure that they won't match as identifiers. |
| 56 | + */ |
public function test_identifier_utf8_three_byte_sequences(): void {
	// The first byte runs from 0xE0 to 0xFF; bytes two and three run from 0x80 to 0xFF.
	for ( $first = 0xE0; $first <= 0xFF; ++$first ) {
		$head = chr( $first );

		for ( $second = 0x80; $second <= 0xFF; ++$second ) {
			// The two-byte prefix is constant for the innermost loop.
			$prefix = $head . chr( $second );

			for ( $third = 0x80; $third <= 0xFF; ++$third ) {
				$candidate = $prefix . chr( $third );

				// PCRE in UTF-8 mode is the reference for validity;
				// a falsy result (0 or false) means the bytes are rejected.
				$pcre_result = preg_match( '/^[\x{0080}-\x{ffff}]$/u', $candidate );

				$lexer      = new WP_MySQL_Lexer( $candidate );
				$token_type = $lexer->next_token()->get_type();

				if ( ! $pcre_result ) {
					$this->assertSame( WP_MySQL_Lexer::INVALID_INPUT, $token_type );
				} else {
					$this->assertSame( WP_MySQL_Lexer::IDENTIFIER, $token_type );
				}
			}
		}
	}
}
| 74 | + |
6 | 75 | /** |
7 | 76 | * Numbers vs. identifiers: |
8 | 77 | * |
|
0 commit comments