2828)
2929
3030from dlt .destinations .exceptions import DatabaseTransientException
31+ from dlt .destinations .sql_client import WithSqlClient
3132from dlt .extract import DltResource , resource as make_resource , DltSource
3233
3334RE_DATA_TYPE = re .compile (r"([A-Z]+)\((\d+)(?:,\s?(\d+))?\)" )
@@ -299,68 +300,109 @@ def get_deterministic_temp_table_name(table_name: str, op: str) -> str:
299300 return f"{ op_name } _{ NamingConvention ._compute_tag (op_name , 0.001 )} "
300301
301302
class WithTableReflectionAndSql(WithTableReflection, WithSqlClient):
    """Client type that combines `WithTableReflection` and `WithSqlClient`.

    Adds no members of its own; it is used as the annotation for clients that
    must provide both table reflection and a `sql_client`.
    """
305+
306+
def _diff_between_actual_and_dlt_schema(
    client: WithTableReflectionAndSql,
    schema: Schema,
    table_name: str,
    actual_col_names: set[str],
    disregard_dlt_columns: bool = True,
) -> TPartialTableSchema:
    """Compares dlt schema with destination table schema and returns columns that appear to be missing.

    This function identifies columns that exist in the dlt schema but are missing from the actual
    destination table. It's used during schema synchronization to detect when columns may have
    been dropped from the destination and need to be removed from the dlt schema as well.

    However, dlt internal columns (_dlt_id, _dlt_load_id) are treated specially because:

    1. Users rarely drop dlt internal columns manually, and if they did,
       dlt cannot recover from this situation anyway.

    2. Athena has a constraint where dlt columns exist in the data but not in the table metadata:

       - Athena external tables have fixed schemas defined at CREATE TABLE time
       - These columns exist in the actual data files but don't appear in INFORMATION_SCHEMA
       - This causes false positives where dlt columns appear "missing" when they're not

    Args:
        client (WithTableReflectionAndSql): The destination client with table reflection capabilities.
            Its `sql_client.escape_column_name` is used to bring dlt column names into the same
            form as `actual_col_names`.
        schema (Schema): The dlt schema to compare against the destination.
        table_name (str): Name of the table to analyze.
        actual_col_names (set[str]): Column names that actually exist in the destination table,
            typically obtained from INFORMATION_SCHEMA queries. For Athena,
            this may not include dlt columns present in the underlying data files.
        disregard_dlt_columns: Whether to ignore apparent mismatches for dlt internal
            columns (_dlt_id, _dlt_load_id). Defaults to True to prevent incorrect
            removal of essential dlt columns from the schema.

    Returns:
        TPartialTableSchema: A partial table schema (`name` and `columns`) containing the columns
            that exist in the dlt schema but are missing from the actual table, listed in dlt
            schema order. An empty dict is returned when no column qualifies.

    Example:
        If dlt schema has [user_id, name, _dlt_id, _dlt_load_id] but destination
        INFORMATION_SCHEMA only shows [user_id, name], this function would return
        an empty dict (assuming disregard_dlt_columns=True) rather than suggesting
        the dlt columns should be dropped.
    """
    col_schemas = schema.get_table_columns(table_name)

    # Map escaped (like actual_col_names) -> original names (what appears in the dlt schema)
    escaped_to_dlt = {
        client.sql_client.escape_column_name(col, quote=False): col for col in col_schemas
    }

    dlt_internal_columns = (C_DLT_ID, C_DLT_LOAD_ID)
    missing_columns = {}

    # Walk the mapping (dlt schema order) instead of a set difference so that the
    # order of the reported columns does not depend on string hashing.
    for esc_name, name_in_dlt in escaped_to_dlt.items():
        if esc_name in actual_col_names:
            continue

        if disregard_dlt_columns and name_in_dlt in dlt_internal_columns:
            continue

        col_schema = col_schemas[name_in_dlt]
        # NOTE(review): the hint key read here is "increment"; confirm it matches the
        # key under which dlt marks incremental columns in the column schema.
        if col_schema.get("increment"):
            # We can warn within the for loop,
            # since there's only one incremental field per table
            logger.warning(
                f"An incremental field {name_in_dlt} is being removed from schema."
                " You should unset the"
                " incremental with `incremental=dlt.sources.incremental.EMPTY`"
            )
        missing_columns[name_in_dlt] = col_schema

    if not missing_columns:
        return {}

    partial_table: TPartialTableSchema = {"name": table_name, "columns": missing_columns}
    return partial_table
385+
302386def update_dlt_schema (
303- client : WithTableReflection ,
387+ client : WithTableReflectionAndSql ,
304388 schema : Schema ,
305389 table_names : Iterable [str ] = None ,
306390 dry_run : bool = False ,
307391) -> Optional [TSchemaDrop ]:
308- """Updates schema to the storage .
392+ """Updates the dlt schema from destination .
309393
310394 Compare the schema we think we should have with what actually exists in the destination,
311395 and drop any tables and/or columns that disappeared.
312396
313397 Args:
398+ client (WithTableReflectionAndSql): The destination client with table reflection capabilities.
399+ schema (Schema): The dlt schema to compare against the destination.
314400 table_names (Iterable[str], optional): Check only listed tables. Defaults to None and checks all tables.
401+ dry_run (bool, optional): Whether to actually update the dlt schema. Defaults to False.
315402
316403 Returns:
317- Optional[TSchemaTables ]: Returns an update that was applied to the schema.
404+ Optional[TSchemaDrop ]: Returns the update that was applied to the schema.
318405 """
319- from dlt .destinations .sql_client import WithSqlClient
320-
321- if not isinstance (client , WithSqlClient ):
322- raise NotImplementedError
323-
324- def _diff_between_actual_and_dlt_schema (
325- table_name : str , actual_col_names : set [str ], disregard_dlt_columns : bool = True
326- ) -> TPartialTableSchema :
327- """Returns a partial table schema containing columns that exist in the dlt schema
328- but are missing from the actual table. Skips dlt internal columns by default.
329- """
330- col_schemas = schema .get_table_columns (table_name )
331-
332- # Map escaped -> original names (actual_col_names are escaped)
333- escaped_to_original = {
334- client .sql_client .escape_column_name (col , quote = False ): col
335- for col in col_schemas .keys ()
336- }
337- dropped_col_names = set (escaped_to_original .keys ()) - actual_col_names
338-
339- if not dropped_col_names :
340- return {}
341-
342- partial_table : TPartialTableSchema = {"name" : table_name , "columns" : {}}
343-
344- for esc_name in dropped_col_names :
345- orig_name = escaped_to_original [esc_name ]
346-
347- # Athena doesn't have dlt columns in actual columns. Don't drop them anyway.
348- if disregard_dlt_columns and orig_name in [C_DLT_ID , C_DLT_LOAD_ID ]:
349- continue
350-
351- col_schema = col_schemas [orig_name ]
352- if col_schema .get ("increment" ):
353- # We can warn within the for loop,
354- # since there's only one incremental field per table
355- logger .warning (
356- f"An incremental field { orig_name } is being removed from schema."
357- "You should unset the"
358- " incremental with `incremental=dlt.sources.incremental.EMPTY`"
359- )
360- partial_table ["columns" ][orig_name ] = col_schema
361-
362- return partial_table if partial_table ["columns" ] else {}
363-
364406 tables = table_names if table_names else schema .data_table_names ()
365407
366408 table_drops : TSchemaDrop = {} # includes entire tables to drop
@@ -379,10 +421,12 @@ def _diff_between_actual_and_dlt_schema(
379421 continue
380422
381423 # actual column schemas present ->
382- # we compare actual schemas with dlt ones ->
424+ # we compare actual column schemas with dlt ones ->
383425 # we take the difference as a partial table
384426 else :
385427 partial_table = _diff_between_actual_and_dlt_schema (
428+ client ,
429+ schema ,
386430 table_name ,
387431 set (actual_col_schemas .keys ()),
388432 )
0 commit comments