|
2 | 2 |
|
3 | 3 | from typing import Any, List, Dict, Type, Optional, Sequence, Tuple, cast, Iterable, Callable |
4 | 4 |
|
| 5 | +from sqlglot import column |
| 6 | + |
5 | 7 | from dlt.common import logger |
6 | 8 | from dlt.common.destination.capabilities import DestinationCapabilitiesContext |
7 | 9 | from dlt.common.destination.typing import PreparedTableSchema |
@@ -301,78 +303,6 @@ class WithTableReflectionAndSql(WithTableReflection, WithSqlClient): |
301 | 303 | pass |
302 | 304 |
|
303 | 305 |
|
304 | | -def get_removed_table_columns( |
305 | | - escape_col_f: Callable[[str, bool, bool], str], |
306 | | - schema: Schema, |
307 | | - table_name: str, |
308 | | - actual_col_names: set[str], |
309 | | - disregard_dlt_columns: bool = True, |
310 | | -) -> TPartialTableSchema: |
311 | | - """Compares dlt schema with destination table schema and returns columns that appear to be missing. |
312 | | -
|
313 | | - This function identifies columns that exist in the dlt schema but are missing from the actual |
314 | | - destination table. It's used during schema synchronization to detect when columns may have |
315 | | - been dropped from the destination and need to be removed from the dlt schema as well. |
316 | | -
|
317 | | - However, dlt internal columns (_dlt_id, _dlt_load_id) are treated specially: |
318 | | - users rarely drop them manually, and if they did, |
319 | | - dlt could not recover from the situation anyway. |
320 | | -
|
321 | | - Args: |
322 | | - escape_col_f (Callable[[str, bool, bool], str]): Function used to escape/normalize dlt column names to the destination format. |
323 | | - schema (Schema): The dlt schema to compare against the destination. |
324 | | - table_name (str): Name of the table to analyze. |
325 | | - actual_col_names (set[str]): Column names that actually exist in the destination table, |
326 | | - typically obtained from INFORMATION_SCHEMA queries. For Athena, |
327 | | - this may not include dlt columns present in the underlying data files. |
328 | | - disregard_dlt_columns (bool): Whether to ignore apparent mismatches for dlt internal |
329 | | - columns (_dlt_id, _dlt_load_id). Defaults to True to prevent incorrect |
330 | | - removal of essential dlt columns from the schema. |
331 | | -
|
332 | | - Returns: |
333 | | - TPartialTableSchema: Returns a partial table schema containing columns that exist in the dlt schema |
334 | | - but are missing from the actual table. |
335 | | -
|
336 | | - Example: |
337 | | - If the dlt schema has [user_id, name, _dlt_id, _dlt_load_id] but the destination |
338 | | - INFORMATION_SCHEMA only shows [user_id, name], this function would return |
339 | | - an empty dict (assuming disregard_dlt_columns=True) rather than suggesting |
340 | | - the dlt columns should be dropped. |
341 | | - """ |
342 | | - col_schemas = schema.get_table_columns(table_name) |
343 | | - |
344 | | - # Transform dlt schema column names to destination format (e.g., 'id' -> 'ID' in Snowflake) |
345 | | - # to match against actual_col_names from INFORMATION_SCHEMA |
346 | | - # Keys: destination format, Values: original dlt schema names |
347 | | - escaped_to_dlt = {escape_col_f(col, False, True): col for col in col_schemas.keys()} |
348 | | - |
349 | | - possibly_dropped_col_names = set(escaped_to_dlt.keys()) - actual_col_names |
350 | | - |
351 | | - if not possibly_dropped_col_names: |
352 | | - return {} |
353 | | - |
354 | | - partial_table: TPartialTableSchema = {"name": table_name, "columns": {}} |
355 | | - |
356 | | - for esc_name in possibly_dropped_col_names: |
357 | | - name_in_dlt = escaped_to_dlt[esc_name] |
358 | | - |
359 | | - if disregard_dlt_columns and schema.is_dlt_entity(name_in_dlt): |
360 | | - continue |
361 | | - |
362 | | - col_schema = col_schemas[name_in_dlt] |
363 | | - if col_schema.get("increment"): |
364 | | - # We can warn within the for loop, |
365 | | - # since there's only one incremental field per table |
366 | | - logger.warning( |
367 | | - f"An incremental field {name_in_dlt} is being removed from schema." |
368 | | - "You should unset the" |
369 | | - " incremental with `incremental=dlt.sources.incremental.EMPTY`" |
370 | | - ) |
371 | | - partial_table["columns"][name_in_dlt] = col_schema |
372 | | - |
373 | | - return partial_table if partial_table["columns"] else {} |
374 | | - |
375 | | - |
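For context, here is a minimal, self-contained sketch of the escape-and-diff logic the removed helper implemented. `escape_col`, the inline column dicts, and the name-prefix check are illustrative stand-ins only (the real code uses the destination's escape callable and `schema.is_dlt_entity`):

```python
# Illustrative sketch; not the dlt API.
def escape_col(name: str, quote: bool, casefold: bool) -> str:
    # Assumption: a Snowflake-like destination upper-cases unquoted identifiers.
    return name.upper() if casefold else name

# Columns tracked in the dlt schema for a hypothetical "users" table.
dlt_columns = {"user_id": {}, "name": {}, "_dlt_id": {}, "_dlt_load_id": {}}

# Columns actually reported by the destination's INFORMATION_SCHEMA
# ("name" was dropped; dlt columns are not reported, as on Athena).
actual_col_names = {"USER_ID"}

# Map destination-format names back to dlt schema names, then take the difference.
escaped_to_dlt = {escape_col(c, False, True): c for c in dlt_columns}
possibly_dropped = set(escaped_to_dlt) - actual_col_names

# Skip dlt internal columns (the disregard_dlt_columns=True behavior);
# startswith("_dlt") stands in for schema.is_dlt_entity.
removed = {
    escaped_to_dlt[e]: dlt_columns[escaped_to_dlt[e]]
    for e in possibly_dropped
    if not escaped_to_dlt[e].startswith("_dlt")
}
print(removed)  # {'name': {}} -> only "name" is flagged for removal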
376 | 306 | def sync_schema_from_storage_schema( |
377 | 307 | get_storage_tables_f: Callable[[Iterable[str]], Iterable[tuple[str, dict[str, TColumnSchema]]]], |
378 | 308 | escape_col_f: Callable[[str, bool, bool], str], |
@@ -416,13 +346,17 @@ def sync_schema_from_storage_schema( |
416 | 346 | # we compare actual column schemas with dlt ones -> |
417 | 347 | # we take the difference as a partial table |
418 | 348 | else: |
419 | | - partial_table = get_removed_table_columns( |
420 | | - escape_col_f, |
421 | | - schema, |
422 | | - table_name, |
423 | | - set(actual_col_schemas.keys()), |
| 349 | + removed_columns = schema.get_removed_table_columns( |
| 350 | + table_name=table_name, |
| 351 | + existing_columns=actual_col_schemas, |
| 352 | + escape_col_f=escape_col_f, |
| 353 | + disregard_dlt_columns=True, |
424 | 354 | ) |
425 | | - if partial_table: |
| 355 | + if removed_columns: |
| 356 | + partial_table: TPartialTableSchema = { |
| 357 | + "name": table_name, |
| 358 | + "columns": {col["name"]: col for col in removed_columns}, |
| 359 | + } |
426 | 360 | column_drops[table_name] = partial_table |
427 | 361 |
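The replacement call site above implies that `Schema.get_removed_table_columns` returns a sequence of column schemas, each carrying its own `name` key. A minimal sketch of how the partial table is then assembled, with hypothetical column dicts standing in for the real return value:

```python
from typing import Any, Dict, List

# Hypothetical return value of schema.get_removed_table_columns(...):
# each removed column is a TColumnSchema-like dict with at least a "name" key.
removed_columns: List[Dict[str, Any]] = [
    {"name": "city", "data_type": "text"},
    {"name": "zip_code", "data_type": "text"},
]

table_name = "users"
if removed_columns:
    # Re-key the list by column name, exactly as the new call site does.
    partial_table: Dict[str, Any] = {
        "name": table_name,
        "columns": {col["name"]: col for col in removed_columns},
    }
    # -> {'name': 'users', 'columns': {'city': {...}, 'zip_code': {...}}}
```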
|
428 | 362 | # 2. For entire table drops, we make sure no orphaned tables remain |
|