Skip to content

Commit f1c56cc

Browse files
committed
Fix number vs identifier matching, add some tests
1 parent 64e2068 commit f1c56cc

File tree

4 files changed

+76
-37
lines changed

4 files changed

+76
-37
lines changed

custom-parser/parser/MySQLLexer.php

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2452,7 +2452,7 @@ protected function NUMBER()
24522452
$this->emitDot();
24532453
$this->type = self::IDENTIFIER;
24542454
}
2455-
} elseif ($this->c === 'e' || $this->c === 'E') {
2455+
} elseif (($this->c === 'e' || $this->c === 'E') && ($this->n === '+' || $this->n === '-' || $this->isDigit($this->n))) {
24562456
$this->consume();
24572457
if ($this->c === '+' || $this->c === '-') {
24582458
$this->consume();
@@ -2463,6 +2463,21 @@ protected function NUMBER()
24632463
$this->type = self::FLOAT_NUMBER;
24642464
}
24652465
}
2466+
2467+
// In MySQL, when an input matches both a number and an identifier, the number always wins.
2468+
// However, when an integer or a 0b/0x-prefixed binary/hex number is directly followed by an
2469+
// identifier character, the whole sequence is an identifier. Decimal and float numbers are not extended this way.
2470+
$possibleIdentifierPrefix =
2471+
$this->type === self::INT_NUMBER
2472+
|| ($this->text[0] === '0' && ($this->text[1] === 'b' || $this->text[1] === 'x'));
2473+
2474+
if ($possibleIdentifierPrefix && preg_match('/\G' . self::PATTERN_UNQUOTED_IDENTIFIER . '/u', $this->input, $matches, 0, $this->position)) {
2475+
$this->text .= $matches[0];
2476+
$this->position += strlen($matches[0]);
2477+
$this->c = $this->input[$this->position] ?? null;
2478+
$this->n = $this->input[$this->position + 1] ?? null;
2479+
$this->type = self::IDENTIFIER;
2480+
}
24662481
}
24672482

24682483
protected function SINGLE_QUOTED_TEXT()

tests/parser/data/failures.csv

Lines changed: 0 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -328,10 +328,7 @@ ALTER TABLE t1 ALTER CONSTRAINT f2_check NOT ENFORCED"
328328
ALTER TABLE t1 ALTER CONSTRAINT constraint_name NOT ENFORCED"
329329
"ALTER TABLE tmp ALTER CONSTRAINT f2_check NOT ENFORCED"
330330
"create table t1 like `a/a`"
331-
"create table 1ea10 (1a20 int,1e int)"
332-
"insert into 1ea10 values(1,1)"
333331
"select 1ea10.1a20,1e+ 1e+10 from 1ea10"
334-
"drop table 1ea10"
335332
"create table t3 like t1"
336333
"create table if not exists t3 like t1"
337334
"create temporary table t3 like t2"
@@ -594,8 +591,6 @@ WHERE dt1.x = dt2.x"
594591
"SHOW REPLICA STATUS"
595592
"SHOW BINARY LOG STATUS"
596593
"SHOW REPLICAS"
597-
"create table t(id int, $id int, $id2 int, `$$id` int, $ int, $1 int,
598-
`$$$` int, id$$$ int, 1$ int, `$$` int, _$ int, b$$lit$$ int)"
599594
"EOF
600595

601596
DROP DATABASE bug19573998"
@@ -757,18 +752,6 @@ WHERE innr1.col_int_key= 4)"
757752
gc INTEGER GENERATED ALWAYS AS (x LIKE 'abba' ESCAPE 'b'))"
758753
exit
759754
"SELECT r FROM t LEFT JOIN LATERAL (SELECT i, RAND(0) AS r) AS dt ON TRUE"
760-
"SELECT TIMEDIFF(TIME('17:00:00'),TIME('17:00:00'))=TIME('00:00:00') AS 1Eq,
761-
TIMEDIFF(TIME('17:59:00'),TIME('17:00:00'))=TIME('00:00:00') AS 1NEq1,
762-
TIMEDIFF(TIME('18:00:00'),TIME('17:00:00'))=TIME('00:00:00') AS 1NEq2,
763-
TIMEDIFF(TIME('17:00:00'),TIME('17:00:00'))= '00:00:00' AS 2Eq,
764-
TIMEDIFF(TIME('17:59:00'),TIME('17:00:00'))= '00:00:00' AS 2NEq1,
765-
TIMEDIFF(TIME('18:00:00'),TIME('17:00:00'))= '00:00:00' AS 2NEq2,
766-
TIMEDIFF(TIME('17:00:00'),TIME('17:00:00'))=TIME(0) AS 3Eq,
767-
TIMEDIFF(TIME('17:59:00'),TIME('17:00:00'))=TIME(0) AS 3NEq1,
768-
TIMEDIFF(TIME('18:00:00'),TIME('17:00:00'))=TIME(0) AS 3NEq2,
769-
TIME(0) AS Time0, TIME('00:00:00') AS Time00, '00:00:00' AS Literal0000,
770-
TIMEDIFF(TIME('17:59:00'),TIME('17:00:00')),
771-
TIMEDIFF(TIME('17:00:00'),TIME('17:59:00'))"
772755
"""',repeat('a',60),repeat('b',60),repeat('c',60),repeat('d',100)), '""');
773756

774757
select insert('txs',2,1,'hi'),insert('is ',4,0,'a'),insert('txxxxt',2,4,'es');
@@ -1060,14 +1043,6 @@ boundary POLYGON)"
10601043
"create table test.t_duplicated like mysqltest_db1.t_select_priv"
10611044
"flush table mysqltest_db1.t1"
10621045
"ALTER USER user() IDENTIFIED BY 'abc'"
1063-
"CREATE USER 20553132_u1@localhost"
1064-
"CREATE USER 20553132_u2@localhost"
1065-
"GRANT ALL ON *.* TO 20553132_u1@localhost"
1066-
"ALTER USER 20553132_u1@localhost PASSWORD EXPIRE"
1067-
"ALTER USER 20553132_u2@localhost IDENTIFIED BY 'abcd', 20553132_u1@localhost IDENTIFIED BY 'defg' PASSWORD EXPIRE NEVER"
1068-
"ALTER USER 20553132_u2@localhost IDENTIFIED BY 'abcd', 20553132_u1@localhost IDENTIFIED WITH 'caching_sha2_password' BY 'hijk' PASSWORD EXPIRE DEFAULT"
1069-
"DROP USER 20553132_u1@localhost"
1070-
"DROP USER 20553132_u2@localhost"
10711046
"EOF
10721047

10731048

@@ -1508,10 +1483,6 @@ KEY k4 (f,g) -- 'f' and 'g' are independet.
15081483
"EXPLAIN FORMAT=JSON INTO @explain_output
15091484
SELECT a FROM t1 x2 WHERE x2.b IN
15101485
(SELECT a FROM t1 x1 WHERE x1.b IN (SELECT a FROM t1 x0))"
1511-
"INSERT INTO t (a)
1512-
WITH RECURSIVE 150tup(n) AS
1513-
(SELECT 1 UNION ALL SELECT n + 1 FROM 150tup WHERE n < 150)
1514-
SELECT n FROM 150tup"
15151486
"EXPLAIN FORMAT=TREE
15161487
SELECT /*+ SET_VAR(optimizer_max_subgraph_pairs = 1) */ COUNT(*)
15171488
FROM t AS t1 LEFT JOIN t AS t2 ON TRUE,
@@ -3735,12 +3706,6 @@ DECLARE a int;
37353706
SELECT 1 INTO a;
37363707
RETURN a;
37373708
END "
3738-
"create procedure 15298_1 () sql security definer show grants for current_user"
3739-
"create procedure 15298_2 () sql security definer show grants"
3740-
"call 15298_1()"
3741-
"call 15298_2()"
3742-
"drop procedure 15298_1"
3743-
"drop procedure 15298_2"
37443709
"CREATE EVENT e1 ON SCHEDULE EVERY 1 SECOND STARTS NOW() DO
37453710
BEGIN
37463711
DECLARE EXIT HANDLER FOR 1136 BEGIN

tests/parser/data/stats.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
Total: 66241 | Failures: 1896 / 2% | Exceptions: 0 / 0%
1+
Total: 66241 | Failures: 1876 / 2% | Exceptions: 0 / 0%

tests/parser/run-lexer-tests.php

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,3 +23,62 @@
2323
}
2424

2525
echo "Tokenized $i queries in ", microtime(true) - $start, 's', PHP_EOL;
26+
27+
// Add some manual tests
28+
$tests = [
29+
/**
30+
* Numbers vs. identifiers:
31+
*
32+
* In MySQL, when an input matches both a number and an identifier, the number always wins.
33+
* However, when an integer or a 0b/0x-prefixed binary/hex number is directly followed by an
34+
* identifier character, the whole sequence is an identifier. Decimal and float numbers are not extended this way.
35+
*/
36+
37+
// INT numbers vs. identifiers
38+
'123' => ['INT_NUMBER', 'EOF'],
39+
'123abc' => ['IDENTIFIER', 'EOF'], // identifier
40+
41+
// BIN numbers vs. identifiers
42+
'0b01' => ['BIN_NUMBER', 'EOF'],
43+
'0b01xyz' => ['IDENTIFIER', 'EOF'], // identifier
44+
"b'01'" => ['BIN_NUMBER', 'EOF'],
45+
"b'01xyz'" => ['BIN_NUMBER', 'IDENTIFIER', 'INVALID_INPUT', 'EOF'],
46+
47+
// HEX numbers vs. identifiers
48+
'0xab01' => ['HEX_NUMBER', 'EOF'],
49+
'0xab01xyz' => ['IDENTIFIER', 'EOF'], // identifier
50+
"x'ab01'" => ['HEX_NUMBER', 'EOF'],
51+
"x'ab01xyz'" => ['HEX_NUMBER', 'IDENTIFIER', 'INVALID_INPUT', 'EOF'],
52+
53+
// DECIMAL numbers vs. identifiers
54+
'123.456' => ['DECIMAL_NUMBER', 'EOF'],
55+
'.123' => ['DECIMAL_NUMBER', 'EOF'],
56+
'123.456abc' => ['DECIMAL_NUMBER', 'IDENTIFIER', 'EOF'], // not identifier
57+
'.123abc' => ['DECIMAL_NUMBER', 'IDENTIFIER', 'EOF'], // not identifier
58+
59+
// FLOAT numbers vs. identifiers
60+
'1e10' => ['FLOAT_NUMBER', 'EOF'],
61+
'1e+10' => ['FLOAT_NUMBER', 'EOF'],
62+
'1e-10' => ['FLOAT_NUMBER', 'EOF'],
63+
'1e10abc' => ['FLOAT_NUMBER', 'IDENTIFIER', 'EOF'], // not identifier (this differs from INT/BIN/HEX numbers)
64+
'1e+10abc' => ['FLOAT_NUMBER', 'IDENTIFIER', 'EOF'], // not identifier
65+
'1e-10abc' => ['FLOAT_NUMBER', 'IDENTIFIER', 'EOF'], // not identifier
66+
];
67+
68+
$failures = 0;
69+
foreach ($tests as $input => $expected) {
70+
$tokens = tokenizeQuery($input);
71+
$token_names = array_map(function ($token) {
72+
return $token->getName();
73+
}, $tokens);
74+
if ($token_names !== $expected) {
75+
$failures += 1;
76+
echo "\nFailed test for input: $input\n";
77+
echo " Expected: ", implode(', ', $expected), "\n";
78+
echo " Actual: ", implode(', ', $token_names), "\n";
79+
}
80+
}
81+
if ($failures > 0) {
82+
echo "\n$failures tests failed!\n";
83+
}
84+

0 commit comments

Comments
 (0)