|
| 1 | +<?php |
| 2 | + |
/**
 * Naively splits an SQL string into a sequence of queries. It
 * streams the data so you can process very large chunks of SQL
 * without running out of memory.
 *
 * This class is **naive** because it doesn't understand what a
 * valid query is. The lexer does not provide a way to distinguish
 * between a syntax error and an incomplete input yet. Lacking this
 * information, we assume that no SQL query is larger than 2MB and,
 * failing to extract a query from a buffer larger than 2MB, we fail.
 * This heuristic is often sufficient, but may fail in pathological
 * cases.
 *
 * Usage:
 *
 *     $stream = new WP_MySQL_Naive_Query_Stream();
 *     $stream->append_sql( 'SELECT id FROM users; SELECT * FROM posts;' );
 *     while ( $stream->next_query() ) {
 *         $sql_string = $stream->get_query();
 *         // Process the query.
 *     }
 *     $stream->append_sql( 'CREATE TABLE users (id INT, name VARCHAR(255));' );
 *     while ( $stream->next_query() ) {
 *         $sql_string = $stream->get_query();
 *         // Process the query.
 *     }
 *     $stream->mark_input_complete();
 *     // Drain once more: a trailing query that lacks a semicolon is only
 *     // returned after the input has been marked complete.
 *     while ( $stream->next_query() ) {
 *         $sql_string = $stream->get_query();
 *     }
 *     $stream->next_query(); // returns false
 */
class WP_MySQL_Naive_Query_Stream {

	/**
	 * The stream is ready to be scanned, or the last call to next_query()
	 * produced a query.
	 */
	const STATE_QUERY = 'valid';

	/**
	 * No query could be extracted although more than MAX_SQL_BUFFER_SIZE
	 * bytes are buffered.
	 */
	const STATE_SYNTAX_ERROR = 'syntax_error';

	/**
	 * The buffer does not hold a complete query and more input may still
	 * be appended.
	 */
	const STATE_PAUSED_ON_INCOMPLETE_INPUT = 'paused_on_incomplete_input';

	/**
	 * The input was marked complete and no further query is left in the buffer.
	 */
	const STATE_FINISHED = 'finished';

	/**
	 * The maximum size of the buffer to store the SQL input. We don't
	 * have enough information from the lexer to distinguish between
	 * an incomplete input and a syntax error so we use a heuristic:
	 * if we've accumulated more than this amount of SQL input, we assume
	 * it's a syntax error. That's why this class is called a "naive" query
	 * stream.
	 */
	const MAX_SQL_BUFFER_SIZE = 1024 * 1024 * 2;

	/**
	 * SQL bytes that were appended but not yet returned as a query.
	 *
	 * @var string
	 */
	private $sql_buffer = '';

	/**
	 * Whether mark_input_complete() was called.
	 *
	 * @var bool
	 */
	private $input_complete = false;

	/**
	 * One of the STATE_* constants.
	 *
	 * @var string
	 */
	private $state = self::STATE_QUERY;

	/**
	 * The query extracted by the last next_query() call, or false when
	 * that call did not produce one.
	 *
	 * @var string|false
	 */
	private $last_query = false;

	public function __construct() {}

	/**
	 * Appends more SQL to the internal buffer and resumes a paused stream.
	 *
	 * @param string $sql The next chunk of SQL input.
	 * @return bool False when the input was already marked complete (the
	 *              chunk is discarded), true otherwise.
	 */
	public function append_sql( string $sql ) {
		if ( $this->input_complete ) {
			return false;
		}
		$this->sql_buffer .= $sql;
		$this->state       = self::STATE_QUERY;
		return true;
	}

	/**
	 * Tells whether the stream is waiting for more input.
	 *
	 * @return bool True when the last scan stopped on incomplete input.
	 */
	public function is_paused_on_incomplete_input(): bool {
		return self::STATE_PAUSED_ON_INCOMPLETE_INPUT === $this->state;
	}

	/**
	 * Declares that no more SQL will be appended.
	 *
	 * A stream that paused on incomplete input is resumed here. Otherwise
	 * next_query() would keep returning false and the trailing query, the
	 * one not terminated by a semicolon, would never be returned.
	 */
	public function mark_input_complete() {
		$this->input_complete = true;
		if ( self::STATE_PAUSED_ON_INCOMPLETE_INPUT === $this->state ) {
			$this->state = self::STATE_QUERY;
		}
	}

	/**
	 * Extracts the next query from the buffer.
	 *
	 * @return bool True when a query is available via get_query(), false
	 *              otherwise. Inspect get_state() to learn why.
	 */
	public function next_query() {
		$this->last_query = false;
		if ( self::STATE_PAUSED_ON_INCOMPLETE_INPUT === $this->state ) {
			// Nothing changed since the last failed scan; don't lex again.
			return false;
		}

		$result = $this->do_next_query();
		if ( ! $result && strlen( $this->sql_buffer ) > self::MAX_SQL_BUFFER_SIZE ) {
			// The "naive" heuristic: an oversized buffer that still yields
			// no query is reported as a syntax error.
			$this->state = self::STATE_SYNTAX_ERROR;
			return false;
		}
		return $result;
	}

	/**
	 * Lexes the buffer up to the first semicolon (or the end of the buffer)
	 * and, if that span forms a query, moves it from the buffer into
	 * $last_query.
	 *
	 * @return bool True when a query was extracted.
	 */
	private function do_next_query(): bool {
		$saw_token             = false;
		$has_meaningful_tokens = false;
		$last_token_id         = null;
		$query_end             = 0;

		// Only the last token and the "anything meaningful?" flag are needed
		// below, so track them while lexing instead of collecting every token.
		$lexer = new WP_MySQL_Lexer( $this->sql_buffer );
		while ( $lexer->next_token() ) {
			$token         = $lexer->get_token();
			$saw_token     = true;
			$last_token_id = $token->id;
			$query_end     = $token->start + $token->length;

			if ( ! $has_meaningful_tokens && ! self::is_ignorable_token_id( $token->id ) ) {
				$has_meaningful_tokens = true;
			}
			if ( WP_MySQL_Lexer::SEMICOLON_SYMBOL === $token->id ) {
				// Got a complete query!
				break;
			}
		}

		// @TODO: Once the lexer exposes its state, return false here when it
		//        reports a syntax error.

		if ( ! $saw_token ) {
			return $this->stop_without_query();
		}

		// The query either needs to end with a semicolon, or be the
		// last thing in the input.
		if ( WP_MySQL_Lexer::SEMICOLON_SYMBOL !== $last_token_id && ! $this->input_complete ) {
			$this->state = self::STATE_PAUSED_ON_INCOMPLETE_INPUT;
			return false;
		}

		// We don't want to give the caller a comment disguised as a query.
		if ( ! $has_meaningful_tokens ) {
			return $this->stop_without_query();
		}

		// Remove the query from the input buffer and return it.
		$this->last_query = substr( $this->sql_buffer, 0, $query_end );
		$this->sql_buffer = substr( $this->sql_buffer, $query_end );
		$this->state      = self::STATE_QUERY;
		return true;
	}

	/**
	 * Records why no query could be produced: the stream is finished when
	 * the input is complete, and paused on incomplete input otherwise.
	 *
	 * @return bool Always false, so callers can return the result directly.
	 */
	private function stop_without_query(): bool {
		$this->state = $this->input_complete
			? self::STATE_FINISHED
			: self::STATE_PAUSED_ON_INCOMPLETE_INPUT;
		return false;
	}

	/**
	 * Tells whether a token id is one of the ids that do not make a query
	 * on their own: whitespace, comments, comment delimiters, and EOF.
	 *
	 * @param mixed $token_id The `id` of a lexer token.
	 * @return bool True when the token should be ignored.
	 */
	private static function is_ignorable_token_id( $token_id ): bool {
		return in_array(
			$token_id,
			array(
				WP_MySQL_Lexer::WHITESPACE,
				WP_MySQL_Lexer::COMMENT,
				WP_MySQL_Lexer::MYSQL_COMMENT_START,
				WP_MySQL_Lexer::MYSQL_COMMENT_END,
				WP_MySQL_Lexer::EOF,
			),
			true
		);
	}

	/**
	 * Returns the query extracted by the last next_query() call.
	 *
	 * @return string|false The query text, or false when the last call did
	 *                      not produce a query.
	 */
	public function get_query() {
		return $this->last_query;
	}

	/**
	 * Returns the current state of the stream.
	 *
	 * @return string One of the STATE_* constants.
	 */
	public function get_state() {
		return $this->state;
	}
}
0 commit comments