Skip to content

Commit 8134870

Browse files
committed
Naive query stream parser
1 parent 8d63ed0 commit 8134870

File tree

3 files changed

+276
-0
lines changed

3 files changed

+276
-0
lines changed

tests/bootstrap.php

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
require_once __DIR__ . '/../wp-includes/mysql/class-wp-mysql-token.php';
1010
require_once __DIR__ . '/../wp-includes/mysql/class-wp-mysql-lexer.php';
1111
require_once __DIR__ . '/../wp-includes/mysql/class-wp-mysql-parser.php';
12+
require_once __DIR__ . '/../wp-includes/mysql/class-wp-mysql-naive-query-stream.php';
1213
require_once __DIR__ . '/../wp-includes/sqlite/class-wp-sqlite-query-rewriter.php';
1314
require_once __DIR__ . '/../wp-includes/sqlite/class-wp-sqlite-lexer.php';
1415
require_once __DIR__ . '/../wp-includes/sqlite/class-wp-sqlite-token.php';
Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
<?php
2+
3+
use PHPUnit\Framework\TestCase;
4+
5+
/**
 * Behavioural tests for WP_MySQL_Naive_Query_Stream: splitting appended SQL
 * into individual queries, pausing on incomplete input, and finishing once
 * the input has been marked complete.
 */
class WP_MySQL_Naive_Query_Stream_Tests extends TestCase {

	/**
	 * Asserts that the stream yields another query and that its text is
	 * exactly the expected one.
	 *
	 * @param WP_MySQL_Naive_Query_Stream $query_stream The stream under test.
	 * @param string                      $expected_sql The exact query text expected next.
	 */
	private function assert_next_query_is( $query_stream, $expected_sql ) {
		self::assertTrue( $query_stream->next_query() );
		self::assertSame( $expected_sql, $query_stream->get_query() );
	}

	/**
	 * A single semicolon-terminated statement is returned verbatim.
	 */
	public function test_next_query_returns_a_single_delimited_query(): void {
		$query_stream = new WP_MySQL_Naive_Query_Stream();
		$query_stream->append_sql( 'SELECT id FROM users;' );

		$this->assert_next_query_is( $query_stream, 'SELECT id FROM users;' );
	}

	/**
	 * Without a delimiter and without the end-of-input mark, nothing is returned.
	 */
	public function test_next_query_returns_false_if_the_input_is_incomplete(): void {
		$query_stream = new WP_MySQL_Naive_Query_Stream();
		$query_stream->append_sql( 'SELECT id FROM users' );

		self::assertFalse( $query_stream->next_query() );
	}

	/**
	 * Once the input is marked complete, an undelimited statement is returned.
	 */
	public function test_next_query_returns_true_if_the_input_is_complete_but_undelimited(): void {
		$query_stream = new WP_MySQL_Naive_Query_Stream();
		$query_stream->append_sql( 'SELECT id FROM users' );
		$query_stream->mark_input_complete();

		$this->assert_next_query_is( $query_stream, 'SELECT id FROM users' );
	}

	/**
	 * Appends that end exactly on a delimiter: every append yields whole queries.
	 */
	public function test_next_query_parses_multiple_queries_with_even_appends(): void {
		$query_stream = new WP_MySQL_Naive_Query_Stream();
		$query_stream->append_sql( 'SELECT id FROM users; SELECT name FROM users2;' );

		$this->assert_next_query_is( $query_stream, 'SELECT id FROM users;' );
		// The whitespace that separated the two statements stays with the second one.
		$this->assert_next_query_is( $query_stream, ' SELECT name FROM users2;' );
		self::assertFalse( $query_stream->next_query() );

		$query_stream->append_sql( 'SELECT name FROM users3;' );
		$this->assert_next_query_is( $query_stream, 'SELECT name FROM users3;' );
		self::assertFalse( $query_stream->next_query() );
	}

	/**
	 * Appends that cut statements in the middle: partial statements are held
	 * back until a later append (or the end-of-input mark) completes them.
	 */
	public function test_next_query_parses_multiple_queries_with_uneven_appends(): void {
		$query_stream = new WP_MySQL_Naive_Query_Stream();

		$query_stream->append_sql( 'SELECT id FROM ' );
		self::assertFalse( $query_stream->next_query() );

		$query_stream->append_sql( 'users; SELECT name ' );
		$this->assert_next_query_is( $query_stream, 'SELECT id FROM users;' );
		self::assertFalse( $query_stream->next_query() );

		$query_stream->append_sql( ', id FROM users2; INSERT' );
		$this->assert_next_query_is( $query_stream, ' SELECT name , id FROM users2;' );
		self::assertFalse( $query_stream->next_query() );

		$query_stream->append_sql( ' INTO users3 VALUES (1, 2)' );
		$query_stream->mark_input_complete();
		$this->assert_next_query_is( $query_stream, ' INSERT INTO users3 VALUES (1, 2)' );
	}

	/**
	 * A block comment before the end of an undelimited statement is part of it.
	 */
	public function test_next_query_parses_queries_with_trailing_block_comments_included(): void {
		$query_stream = new WP_MySQL_Naive_Query_Stream();
		$query_stream->append_sql( 'SELECT id FROM users /* foo */' );
		$query_stream->mark_input_complete();

		$this->assert_next_query_is( $query_stream, 'SELECT id FROM users /* foo */' );
		self::assertFalse( $query_stream->next_query() );
	}

	/**
	 * A block comment after the delimiter is not reported as a query of its own.
	 */
	public function test_next_query_parses_queries_with_trailing_block_comments_excluded(): void {
		$query_stream = new WP_MySQL_Naive_Query_Stream();
		$query_stream->append_sql( 'SELECT id FROM users; /* foo */' );
		$query_stream->mark_input_complete();

		$this->assert_next_query_is( $query_stream, 'SELECT id FROM users;' );

		self::assertFalse( $query_stream->next_query() );
		self::assertEquals( WP_MySQL_Naive_Query_Stream::STATE_FINISHED, $query_stream->get_state() );
	}

	/**
	 * Input that exceeds the stream's buffer limit without yielding a query
	 * is reported as a syntax error.
	 */
	public function test_treats_too_large_input_as_a_syntax_error(): void {
		// 6 bytes repeated 1024 * 1024 times, with no delimiter anywhere.
		$oversized_sql = str_repeat( 'lorem ', 1024 * 1024 );

		$query_stream = new WP_MySQL_Naive_Query_Stream();
		$query_stream->append_sql( $oversized_sql );

		self::assertFalse( $query_stream->next_query() );
		self::assertEquals( WP_MySQL_Naive_Query_Stream::STATE_SYNTAX_ERROR, $query_stream->get_state() );
	}

	/**
	 * Skipped: the naive stream cannot recognise an invalid query.
	 */
	public function test_next_query_returns_false_if_the_input_has_a_syntax_error(): void {
		self::markTestSkipped('This test is expected to fail because the naive query stream doesn\'t understand what a valid query is. It\'s just a heuristic that works for most cases.');

		$query_stream = new WP_MySQL_Naive_Query_Stream();
		$query_stream->append_sql( 'SELECT id FROM users WHERE id = ihj' );
		$query_stream->mark_input_complete();

		self::assertFalse( $query_stream->next_query() );
	}
}
Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
<?php
2+
3+
/**
 * Naively splits an SQL string into a sequence of queries. It
 * streams the data so you can process very large chunks of SQL
 * without running out of memory.
 *
 * This class is **naive** because it doesn't understand what a
 * valid query is. The lexer does not provide a way to distinguish
 * between a syntax error and an incomplete input yet. Lacking this
 * information, we assume that no SQL query is larger than 2MB and,
 * failing to extract a query from a buffer larger than that, we fail.
 * This heuristic is often sufficient, but may fail in pathological cases.
 *
 * Usage:
 *
 *     $stream = new WP_MySQL_Naive_Query_Stream();
 *     $stream->append_sql( 'SELECT id FROM users; SELECT * FROM posts;' );
 *     while ( $stream->next_query() ) {
 *         $sql_string = $stream->get_query();
 *         // Process the query.
 *     }
 *     $stream->append_sql( 'CREATE TABLE users (id INT, name VARCHAR(255));' );
 *     while ( $stream->next_query() ) {
 *         $sql_string = $stream->get_query();
 *         // Process the query.
 *     }
 *     $stream->mark_input_complete();
 *     $stream->next_query(); // returns false
 */
class WP_MySQL_Naive_Query_Stream {

	/**
	 * The stream is ready to be scanned, or the last scan produced a query.
	 */
	const STATE_QUERY = 'valid';

	/**
	 * No query could be extracted and the buffer grew past MAX_SQL_BUFFER_SIZE.
	 */
	const STATE_SYNTAX_ERROR = 'syntax_error';

	/**
	 * The buffer holds no complete query yet and more input may still arrive.
	 */
	const STATE_PAUSED_ON_INCOMPLETE_INPUT = 'paused_on_incomplete_input';

	/**
	 * The input was marked complete and no further query is left in the buffer.
	 */
	const STATE_FINISHED = 'finished';

	/**
	 * The maximum size of the buffer to store the SQL input. We don't
	 * have enough information from the lexer to distinguish between
	 * an incomplete input and a syntax error so we use a heuristic –
	 * if we've accumulated more than this amount of SQL input, we assume
	 * it's a syntax error. That's why this class is called a "naive" query
	 * stream.
	 */
	const MAX_SQL_BUFFER_SIZE = 1024 * 1024 * 2;

	/**
	 * SQL that was appended but not yet returned as a query.
	 *
	 * @var string
	 */
	private $sql_buffer = '';

	/**
	 * Whether the caller declared that no more SQL will be appended.
	 *
	 * @var bool
	 */
	private $input_complete = false;

	/**
	 * One of the STATE_* constants. Starts in the same state that
	 * append_sql() establishes, so get_state() never exposes a value
	 * outside of the documented set.
	 *
	 * @var string
	 */
	private $state = self::STATE_QUERY;

	/**
	 * The query found by the most recent next_query() call, or false.
	 *
	 * @var string|false
	 */
	private $last_query = false;

	public function __construct() {}

	/**
	 * Adds more SQL to the end of the internal buffer and makes the stream
	 * scannable again.
	 *
	 * @param string $sql The next chunk of SQL input.
	 * @return bool False when the input was already marked complete (the
	 *              chunk is discarded), true otherwise.
	 */
	public function append_sql( string $sql ) {
		if ( $this->input_complete ) {
			return false;
		}
		$this->sql_buffer .= $sql;
		$this->state       = self::STATE_QUERY;
		return true;
	}

	/**
	 * Tells whether the last scan stopped because the buffer did not yet
	 * contain a complete query.
	 *
	 * @return bool
	 */
	public function is_paused_on_incomplete_input(): bool {
		return self::STATE_PAUSED_ON_INCOMPLETE_INPUT === $this->state;
	}

	/**
	 * Declares that no more SQL will be appended.
	 *
	 * After this call, an undelimited statement left in the buffer counts
	 * as a complete query.
	 */
	public function mark_input_complete() {
		$this->input_complete = true;

		// next_query() refuses to scan while the stream is paused, and
		// append_sql() – the only other way out of the paused state – is
		// rejected from now on. Without this reset, a trailing undelimited
		// query that was already buffered when the stream paused could
		// never be returned.
		if ( self::STATE_PAUSED_ON_INCOMPLETE_INPUT === $this->state ) {
			$this->state = self::STATE_QUERY;
		}
	}

	/**
	 * Advances to the next query in the buffer.
	 *
	 * @return bool True when a query was found (read it with get_query()),
	 *              false otherwise (inspect get_state() for the reason).
	 */
	public function next_query() {
		$this->last_query = false;

		// Nothing changed since the last scan, so scanning again is pointless.
		if ( self::STATE_PAUSED_ON_INCOMPLETE_INPUT === $this->state ) {
			return false;
		}

		$found = $this->do_next_query();
		if ( ! $found && strlen( $this->sql_buffer ) > self::MAX_SQL_BUFFER_SIZE ) {
			// Heuristic: this much input without a single query is treated
			// as a syntax error rather than as incomplete input.
			$this->state = self::STATE_SYNTAX_ERROR;
			return false;
		}
		return $found;
	}

	/**
	 * Lexes the buffer up to the first semicolon (or the end of the buffer)
	 * and, if that span forms a query, moves it out of the buffer.
	 *
	 * @return bool Whether a query was extracted.
	 */
	private function do_next_query() {
		$tokens = [];
		$lexer  = new WP_MySQL_Lexer( $this->sql_buffer );
		while ( $lexer->next_token() ) {
			$token    = $lexer->get_token();
			$tokens[] = $token;
			if ( WP_MySQL_Lexer::SEMICOLON_SYMBOL === $token->id ) {
				// Got a complete query!
				break;
			}
		}

		// @TODO: Once the lexer exposes its state, bail out here when it
		//        reports a syntax error.

		if ( 0 === count( $tokens ) ) {
			return $this->stop_without_query();
		}

		// The query either needs to end with a semicolon, or reach the
		// very end of a completed input.
		$last_token = $tokens[ count( $tokens ) - 1 ];
		if (
			WP_MySQL_Lexer::SEMICOLON_SYMBOL !== $last_token->id &&
			! $this->input_complete
		) {
			$this->state = self::STATE_PAUSED_ON_INCOMPLETE_INPUT;
			return false;
		}

		// We don't want to hand the caller a comment disguised as a query.
		if ( ! $this->has_meaningful_tokens( $tokens ) ) {
			return $this->stop_without_query();
		}

		// Remove the query from the input buffer and return it.
		$last_byte        = $last_token->start + $last_token->length;
		$this->last_query = substr( $this->sql_buffer, 0, $last_byte );
		$this->sql_buffer = substr( $this->sql_buffer, $last_byte );
		$this->state      = self::STATE_QUERY;
		return true;
	}

	/**
	 * Records why no query is available: the stream is finished when the
	 * input is complete, and paused otherwise.
	 *
	 * @return bool Always false, so callers can return the result directly.
	 */
	private function stop_without_query() {
		$this->state = $this->input_complete
			? self::STATE_FINISHED
			: self::STATE_PAUSED_ON_INCOMPLETE_INPUT;
		return false;
	}

	/**
	 * Tells whether any of the tokens is something other than whitespace,
	 * a comment, a MySQL comment marker, or the end-of-file token.
	 *
	 * @param array $tokens Tokens produced by the lexer.
	 * @return bool
	 */
	private function has_meaningful_tokens( array $tokens ) {
		foreach ( $tokens as $token ) {
			if (
				WP_MySQL_Lexer::WHITESPACE !== $token->id &&
				WP_MySQL_Lexer::COMMENT !== $token->id &&
				WP_MySQL_Lexer::MYSQL_COMMENT_START !== $token->id &&
				WP_MySQL_Lexer::MYSQL_COMMENT_END !== $token->id &&
				WP_MySQL_Lexer::EOF !== $token->id
			) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Returns the query found by the most recent next_query() call.
	 *
	 * The text is cut from the buffer as-is: it includes the trailing
	 * semicolon (when present) and any leading whitespace or comments.
	 *
	 * @return string|false The query text, or false if the last
	 *                      next_query() call did not find a query.
	 */
	public function get_query() {
		return $this->last_query;
	}

	/**
	 * Returns the current state of the stream.
	 *
	 * @return string One of the STATE_* constants.
	 */
	public function get_state() {
		return $this->state;
	}

}

0 commit comments

Comments
 (0)