Commit b1c3f3f

Better docstrings, var names

1 parent 327c25e commit b1c3f3f

4 files changed, +156 -55 lines changed

dlt/destinations/impl/filesystem/filesystem.py

Lines changed: 11 additions & 3 deletions

@@ -82,7 +82,6 @@
 from dlt.destinations.utils import (
     verify_schema_merge_disposition,
     verify_schema_replace_disposition,
-    update_dlt_schema,
 )

 CURRENT_VERSION: int = 2
@@ -474,8 +473,12 @@ def drop_tables(self, *tables: str, delete_schema: bool = True) -> None:
     def get_storage_tables(
         self, table_names: Iterable[str]
     ) -> Iterable[Tuple[str, TTableSchemaColumns]]:
-        """Yields tables that have files in storage, returns columns from files in storage for regular delta/iceberg tables,
-        or from schema for regular tables without table format"""
+        """Yield (table_name, column_schemas) pairs for tables that have files in storage.
+
+        For Delta and Iceberg tables, the columns present in the actual table metadata
+        are returned. For tables using regular file formats, the column schemas come from the
+        dlt schema instead, since their real schema cannot be reflected directly.
+        """
         for table_name in table_names:
             table_dir = self.get_table_dir(table_name)
             if (
@@ -517,6 +520,11 @@ def get_storage_tables(
                 yield (table_name, col_schemas)

             else:
+                logger.warning(
+                    f"Table '{table_name}' uses a regular file format and does not"
+                    " support true schema reflection. Returning column schemas from"
+                    " the dlt schema."
+                )
                 yield (table_name, self.schema.get_table_columns(table_name))

         else:
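
For orientation, a minimal sketch of how the revised get_storage_tables() contract might be consumed against a local filesystem destination; the pipeline and table names are illustrative, not part of the commit, and destination_client() is assumed to return the filesystem client:

    import dlt

    pipeline = dlt.pipeline("storage_tables_demo", destination="filesystem")
    pipeline.run([{"id": 1, "name": "a"}], table_name="my_table")

    # get_storage_tables() yields (table_name, column_schemas) pairs; for
    # tables without an open table format the columns come from the dlt
    # schema and a warning is logged, per the change above
    with pipeline.destination_client() as client:
        for table_name, columns in client.get_storage_tables(["my_table"]):
            print(table_name, list(columns.keys()))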

dlt/destinations/utils.py

Lines changed: 93 additions & 49 deletions

@@ -28,6 +28,7 @@
 )

 from dlt.destinations.exceptions import DatabaseTransientException
+from dlt.destinations.sql_client import WithSqlClient
 from dlt.extract import DltResource, resource as make_resource, DltSource

 RE_DATA_TYPE = re.compile(r"([A-Z]+)\((\d+)(?:,\s?(\d+))?\)")
@@ -299,68 +300,109 @@ def get_deterministic_temp_table_name(table_name: str, op: str) -> str:
     return f"{op_name}_{NamingConvention._compute_tag(op_name, 0.001)}"


+class WithTableReflectionAndSql(WithTableReflection, WithSqlClient):
+    pass
+
+
+def _diff_between_actual_and_dlt_schema(
+    client: WithTableReflectionAndSql,
+    schema: Schema,
+    table_name: str,
+    actual_col_names: set[str],
+    disregard_dlt_columns: bool = True,
+) -> TPartialTableSchema:
+    """Compares the dlt schema with the destination table schema and returns columns that appear to be missing.
+
+    This function identifies columns that exist in the dlt schema but are missing from the actual
+    destination table. It's used during schema synchronization to detect when columns may have
+    been dropped from the destination and need to be removed from the dlt schema as well.
+
+    However, dlt internal columns (_dlt_id, _dlt_load_id) are treated specially because:
+
+    1. Users rarely drop dlt internal columns manually, and if they did,
+       dlt cannot recover from this situation anyway.
+
+    2. Athena has a constraint where dlt columns exist in the data but not in the table metadata:
+
+       - Athena external tables have fixed schemas defined at CREATE TABLE time
+       - These columns exist in the actual data files but don't appear in INFORMATION_SCHEMA
+       - This causes false positives where dlt columns appear "missing" when they're not
+
+    Args:
+        client (WithTableReflectionAndSql): The destination client with table reflection capabilities.
+        schema (Schema): The dlt schema to compare against the destination.
+        table_name (str): Name of the table to analyze.
+        actual_col_names (set[str]): Column names that actually exist in the destination table,
+            typically obtained from INFORMATION_SCHEMA queries. For Athena,
+            this may not include dlt columns present in the underlying data files.
+        disregard_dlt_columns: Whether to ignore apparent mismatches for dlt internal
+            columns (_dlt_id, _dlt_load_id). Defaults to True to prevent incorrect
+            removal of essential dlt columns from the schema.
+
+    Returns:
+        TPartialTableSchema: A partial table schema containing columns that exist in the dlt schema
+            but are missing from the actual table.
+
+    Example:
+        If the dlt schema has [user_id, name, _dlt_id, _dlt_load_id] but the destination
+        INFORMATION_SCHEMA only shows [user_id, name], this function would return
+        an empty dict (assuming disregard_dlt_columns=True) rather than suggesting
+        the dlt columns should be dropped.
+    """
+    col_schemas = schema.get_table_columns(table_name)
+
+    # Map escaped (like actual_col_names) -> original names (what appears in the dlt schema)
+    escaped_to_dlt = {
+        client.sql_client.escape_column_name(col, quote=False): col for col in col_schemas.keys()
+    }
+
+    possibly_dropped_col_names = set(escaped_to_dlt.keys()) - actual_col_names
+
+    if not possibly_dropped_col_names:
+        return {}
+
+    partial_table: TPartialTableSchema = {"name": table_name, "columns": {}}
+
+    for esc_name in possibly_dropped_col_names:
+        name_in_dlt = escaped_to_dlt[esc_name]
+
+        if disregard_dlt_columns and name_in_dlt in [C_DLT_ID, C_DLT_LOAD_ID]:
+            continue
+
+        col_schema = col_schemas[name_in_dlt]
+        if col_schema.get("increment"):
+            # We can warn within the for loop,
+            # since there's only one incremental field per table
+            logger.warning(
+                f"An incremental field {name_in_dlt} is being removed from the schema."
+                " You should unset the incremental with"
+                " `incremental=dlt.sources.incremental.EMPTY`"
+            )
+        partial_table["columns"][name_in_dlt] = col_schema
+
+    return partial_table if partial_table["columns"] else {}
+
+
 def update_dlt_schema(
-    client: WithTableReflection,
+    client: WithTableReflectionAndSql,
     schema: Schema,
     table_names: Iterable[str] = None,
     dry_run: bool = False,
 ) -> Optional[TSchemaDrop]:
-    """Updates schema to the storage.
+    """Updates the dlt schema from the destination.

     Compare the schema we think we should have with what actually exists in the destination,
     and drop any tables and/or columns that disappeared.

     Args:
+        client (WithTableReflectionAndSql): The destination client with table reflection capabilities.
+        schema (Schema): The dlt schema to compare against the destination.
         table_names (Iterable[str], optional): Check only listed tables. Defaults to None and checks all tables.
+        dry_run (bool, optional): If True, compute the update without applying it to the dlt schema. Defaults to False.

     Returns:
-        Optional[TSchemaTables]: Returns an update that was applied to the schema.
+        Optional[TSchemaDrop]: Returns the update that was applied to the schema.
     """
-    from dlt.destinations.sql_client import WithSqlClient
-
-    if not isinstance(client, WithSqlClient):
-        raise NotImplementedError
-
-    def _diff_between_actual_and_dlt_schema(
-        table_name: str, actual_col_names: set[str], disregard_dlt_columns: bool = True
-    ) -> TPartialTableSchema:
-        """Returns a partial table schema containing columns that exist in the dlt schema
-        but are missing from the actual table. Skips dlt internal columns by default.
-        """
-        col_schemas = schema.get_table_columns(table_name)
-
-        # Map escaped -> original names (actual_col_names are escaped)
-        escaped_to_original = {
-            client.sql_client.escape_column_name(col, quote=False): col
-            for col in col_schemas.keys()
-        }
-        dropped_col_names = set(escaped_to_original.keys()) - actual_col_names
-
-        if not dropped_col_names:
-            return {}
-
-        partial_table: TPartialTableSchema = {"name": table_name, "columns": {}}
-
-        for esc_name in dropped_col_names:
-            orig_name = escaped_to_original[esc_name]
-
-            # Athena doesn't have dlt columns in actual columns. Don't drop them anyway.
-            if disregard_dlt_columns and orig_name in [C_DLT_ID, C_DLT_LOAD_ID]:
-                continue
-
-            col_schema = col_schemas[orig_name]
-            if col_schema.get("increment"):
-                # We can warn within the for loop,
-                # since there's only one incremental field per table
-                logger.warning(
-                    f"An incremental field {orig_name} is being removed from schema."
-                    "You should unset the"
-                    " incremental with `incremental=dlt.sources.incremental.EMPTY`"
-                )
-            partial_table["columns"][orig_name] = col_schema
-
-        return partial_table if partial_table["columns"] else {}
-
     tables = table_names if table_names else schema.data_table_names()

     table_drops: TSchemaDrop = {}  # includes entire tables to drop
@@ -379,10 +421,12 @@ def _diff_between_actual_and_dlt_schema(
             continue

         # actual column schemas present ->
-        # we compare actual schemas with dlt ones ->
+        # we compare actual column schemas with dlt ones ->
         # we take the difference as a partial table
         else:
             partial_table = _diff_between_actual_and_dlt_schema(
+                client,
+                schema,
                 table_name,
                 set(actual_col_schemas.keys()),
             )
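
The core of the new top-level _diff_between_actual_and_dlt_schema() is the escaped-name mapping: reflected column names come back escaped, so they must be mapped back to dlt schema names before taking the set difference. A self-contained sketch of that idea, with a toy lowercasing escaper standing in for sql_client.escape_column_name():

    def find_missing_columns(
        dlt_columns: dict, actual_col_names: set, escape=str.lower
    ) -> dict:
        protected = {"_dlt_id", "_dlt_load_id"}
        # map escaped -> dlt name, since actual_col_names arrive escaped
        escaped_to_dlt = {escape(c): c for c in dlt_columns}
        missing = set(escaped_to_dlt) - actual_col_names
        return {
            escaped_to_dlt[e]: dlt_columns[escaped_to_dlt[e]]
            for e in missing
            if escaped_to_dlt[e] not in protected  # never drop dlt internals
        }

    cols = {"UserId": {}, "name": {}, "_dlt_id": {}, "_dlt_load_id": {}}
    # the destination reflects lowercased names and only shows "name"
    print(find_missing_columns(cols, {"name"}))
    # {'UserId': {}} -- dlt internals are skipped even though they look missing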

dlt/pipeline/pipeline.py

Lines changed: 6 additions & 3 deletions

@@ -112,7 +112,7 @@
 from dlt.destinations.sql_client import SqlClientBase, WithSqlClient
 from dlt.destinations.fs_client import FSClientBase
 from dlt.destinations.dataset import get_destination_clients
-from dlt.destinations.utils import update_dlt_schema
+from dlt.destinations.utils import update_dlt_schema, WithTableReflectionAndSql

 from dlt.load.configuration import LoaderConfiguration
 from dlt.load import Load
@@ -1066,9 +1066,12 @@ def sync_schema_from_destination(
         with self._get_destination_clients(schema)[0] as client:
             if not client.is_storage_initialized():
                 raise DestinationUndefinedEntity()
-            if isinstance(client, WithTableReflection):
+            if isinstance(client, WithTableReflection) and isinstance(client, WithSqlClient):
                 return update_dlt_schema(
-                    client=client, schema=schema, table_names=table_names, dry_run=dry_run
+                    client=cast(WithTableReflectionAndSql, client),
+                    schema=schema,
+                    table_names=table_names,
+                    dry_run=dry_run,
                 )
             else:
                 raise DestinationTableReflectionNotSupported(self._destination.destination_name)
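
A sketch of the user-facing call path changed above, assuming this branch and a destination that supports both table reflection and SQL (duckdb here; the pipeline and table names are illustrative):

    import dlt

    pipeline = dlt.pipeline("sync_demo", destination="duckdb")
    pipeline.run([{"id": 1, "age": 30}], table_name="my_table")

    # dry_run=True reports tables/columns that disappeared from the
    # destination without mutating the dlt schema; dry_run=False applies them
    drops = pipeline.sync_schema_from_destination(dry_run=True)
    print(drops)  # {} while the destination still matches the dlt schema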

tests/load/pipeline/test_sync_dlt_schema.py

Lines changed: 46 additions & 0 deletions

@@ -1,4 +1,5 @@
 import json
+import os
 from typing import cast

 import pytest
@@ -11,7 +12,9 @@

 from dlt.common import logger
 from tests.pipeline.utils import assert_load_info
+from tests.utils import TEST_STORAGE_ROOT
 from tests.load.utils import (
+    FILE_BUCKET,
     destinations_configs,
     DestinationTestConfiguration,
 )
@@ -186,6 +189,12 @@ def test_sync_dlt_schema(
     _drop_column_in_sql(pipeline, destination_config, "my_table", "age")
     _drop_table_in_sql(pipeline, destination_config, "my_last_table")

+    # Sanity check that the tables are still all in the dlt schema
+    assert all(
+        table_name in pipeline.default_schema.tables
+        for table_name in ["my_table", "my_other_table", "my_last_table"]
+    )
+
     # Make sure the warning about orphaned tables is emitted
     logger_spy = mocker.spy(logger, "warning")

@@ -227,3 +236,40 @@

     assert "my_last_table" not in pipeline.default_schema.tables
     assert "my_last_table__children" not in pipeline.default_schema.tables
+
+
+@pytest.mark.parametrize(
+    "destination_config",
+    destinations_configs(
+        local_filesystem_configs=True,
+    ),
+    ids=lambda x: x.name,
+)
+def test_regular_filesystem_tables(
+    destination_config: DestinationTestConfiguration, mocker: MockerFixture
+) -> None:
+    pipeline = destination_config.setup_pipeline(pipeline_name=f"pipe_{uniq_id()}")
+
+    # 1. Check whether dropping an entire table without a table format is handled correctly
+    assert_load_info(pipeline.run(my_resource(), **destination_config.run_kwargs))
+
+    _drop_table_in_filesystem(pipeline, destination_config, "my_table")
+
+    # Sanity check that the table is still in the dlt schema
+    assert "my_table" in pipeline.default_schema.tables
+
+    # An entire table drop should not emit warnings regardless of table format
+    logger_spy = mocker.spy(logger, "warning")
+    schema_drops = pipeline.sync_schema_from_destination()
+    logger_spy.assert_not_called()
+
+    # The table should be in the schema drops and removed from the schema itself
+    assert len(schema_drops) == 1
+    assert "my_table" in schema_drops
+    assert "my_table" not in pipeline.default_schema.tables
+
+    # 2. When a column is dropped from a table without an open table format,
+    # we emit a warning, but don't update the dlt schema
+    assert_load_info(pipeline.run(my_resource(), **destination_config.run_kwargs))
+
+    # TODO: Finish this.
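
The _drop_table_in_filesystem() helper the new test relies on is not part of this diff. Given the new os, TEST_STORAGE_ROOT, and FILE_BUCKET imports, a plausible (hypothetical) sketch of such a helper:

    import os
    import shutil

    def _drop_table_in_filesystem(pipeline, destination_config, table_name: str) -> None:
        # hypothetical: remove the table's directory from the local bucket so
        # the destination no longer has any files for it; FILE_BUCKET may
        # carry a file:// scheme prefix on some setups
        bucket = FILE_BUCKET.replace("file://", "")
        table_dir = os.path.join(bucket, pipeline.dataset_name, table_name)
        if os.path.isdir(table_dir):
            shutil.rmtree(table_dir)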
