Skip to content

Commit 9471bc4

Browse files
committed
get_removed_table_columns in schema
1 parent 0209322 commit 9471bc4

File tree

3 files changed

+105
-78
lines changed

3 files changed

+105
-78
lines changed

dlt/common/schema/exceptions.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,32 @@ class SchemaCorruptedException(SchemaException):
104104
pass
105105

106106

107+
class SchemaIdentifierDestinationCollision(SchemaCorruptedException):
    """Raised when multiple column names collide after transformation to destination format.

    This should not happen under normal circumstances and indicates schema corruption.
    """

    def __init__(
        self,
        schema_name: str,
        table_name: str,
        colliding_columns: List[str],
        destination_format: str,
    ) -> None:
        # Keep the collision details on the instance so callers can inspect them
        # programmatically instead of parsing the message.
        self.table_name = table_name
        self.colliding_columns = colliding_columns
        self.destination_format = destination_format

        listed_columns = ", ".join(repr(name) for name in colliding_columns)
        msg = (
            f"Multiple columns in table `{table_name}` collide when transformed to"
            f" destination format `{destination_format}`: {listed_columns}."
            " This should not happen under normal circumstances and indicates schema corruption."
        )
        super().__init__(schema_name, msg)
131+
132+
107133
class SchemaIdentifierNormalizationCollision(SchemaCorruptedException):
108134
def __init__(
109135
self,

dlt/common/schema/schema.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
Set,
1616
)
1717

18+
from dlt.common import logger
1819
from dlt.common.schema.migrations import migrate_schema
1920
from dlt.common.utils import extend_list_deduplicated, simple_repr, without_none
2021
from dlt.common.typing import (
@@ -521,6 +522,72 @@ def get_new_table_columns(
521522
diff_c.append(c)
522523
return diff_c
523524

525+
def get_removed_table_columns(
    self,
    table_name: str,
    existing_columns: TTableSchemaColumns,
    escape_col_f: Callable[[str, bool, bool], str],
    disregard_dlt_columns: bool = True,
) -> List[TColumnSchema]:
    """Gets columns to be removed from schema to match `existing_columns`.

    This function identifies columns that exist in the dlt schema but are missing from the
    destination table. It's used during schema synchronization to detect when columns have
    been dropped from the destination and need to be removed from the dlt schema as well.

    Column names are compared by transforming dlt schema names to destination format using
    `escape_col_f`. `existing_columns` are expected to be in destination format (as they
    appear in the destination's INFORMATION_SCHEMA).

    dlt internal columns (_dlt_id, _dlt_load_id) can be optionally disregarded because
    users rarely drop these columns manually, and if they did, dlt cannot recover from
    this situation anyway.

    Args:
        table_name (str): Name of the table to analyze.
        existing_columns (TTableSchemaColumns): Column schemas that actually exist in the
            destination table, typically obtained from INFORMATION_SCHEMA queries. Column
            names should be in destination format.
        escape_col_f (Callable[[str, bool, bool], str]): Function to transform dlt column
            names to destination format (e.g., 'id' -> 'ID' in Snowflake).
        disregard_dlt_columns (bool): Whether to ignore apparent mismatches for dlt internal
            columns (_dlt_id, _dlt_load_id). Defaults to True.

    Returns:
        List[TColumnSchema]: List of column schemas that exist in the dlt schema but are
            missing from the destination table.

    Raises:
        SchemaCorruptedException: If two or more dlt column names map to the same name
            after transformation to destination format.
    """
    # Transform dlt schema column names to destination format (e.g., 'id' -> 'ID' in Snowflake)
    # to match against actual_col_names from INFORMATION_SCHEMA
    # Keys: destination format, Values: original dlt schema names
    col_schemas = self.get_table_columns(table_name)
    escaped_to_dlt = {escape_col_f(col, False, True): col for col in col_schemas.keys()}

    # A shrunken mapping means at least two dlt names collapsed onto one destination name,
    # which would make the comparison below silently wrong — fail loudly instead.
    if len(escaped_to_dlt) != len(col_schemas):
        raise SchemaCorruptedException(
            self.name,
            f"Columns in table `{table_name}` have colliding names when transformed to"
            " destination format. Original dlt schema column names:"
            f" {list(col_schemas.keys())}. Destination format names:"
            f" {list(escaped_to_dlt.keys())}. This should not happen under normal circumstances"
            " and indicates schema corruption.",
        )

    diff_c: List[TColumnSchema] = []
    for dest_name, name_in_dlt in escaped_to_dlt.items():
        if disregard_dlt_columns and self.is_dlt_entity(name_in_dlt):
            continue
        if dest_name not in existing_columns:
            col_schema = col_schemas[name_in_dlt]
            if col_schema.get("incremental"):
                # Fixed message: implicit string concatenation previously produced
                # "schema.You should" with no separating space.
                logger.warning(
                    f"An incremental field {name_in_dlt} is being removed from schema."
                    " You should unset the"
                    " incremental with `incremental=dlt.sources.incremental.EMPTY`"
                )
            diff_c.append(col_schema)
    return diff_c
590+
524591
def get_table(self, table_name: str) -> TTableSchema:
525592
try:
526593
return self._schema_tables[table_name]

dlt/destinations/utils.py

Lines changed: 12 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
from typing import Any, List, Dict, Type, Optional, Sequence, Tuple, cast, Iterable, Callable
44

5+
from sqlglot import column
6+
57
from dlt.common import logger
68
from dlt.common.destination.capabilities import DestinationCapabilitiesContext
79
from dlt.common.destination.typing import PreparedTableSchema
@@ -301,78 +303,6 @@ class WithTableReflectionAndSql(WithTableReflection, WithSqlClient):
301303
pass
302304

303305

304-
def get_removed_table_columns(
    escape_col_f: Callable[[str, bool, bool], str],
    schema: Schema,
    table_name: str,
    actual_col_names: set[str],
    disregard_dlt_columns: bool = True,
) -> TPartialTableSchema:
    """Compares dlt schema with destination table schema and returns columns that appear to be missing.

    This function identifies columns that exist in the dlt schema but are missing from the actual
    destination table. It's used during schema synchronization to detect when columns may have
    been dropped from the destination and need to be removed from the dlt schema as well.

    However, dlt internal columns (_dlt_id, _dlt_load_id) are treated specially because
    users rarely drop dlt internal columns manually, and if they did,
    dlt cannot recover from this situation anyway.

    Args:
        escape_col_f (Callable[[str, bool, bool], str]): Function that transforms a dlt column
            name to destination format (e.g., 'id' -> 'ID' in Snowflake).
        schema (Schema): The dlt schema to compare against the destination.
        table_name (str): Name of the table to analyze.
        actual_col_names (set[str]): Column names that actually exist in the destination table,
            typically obtained from INFORMATION_SCHEMA queries. For Athena,
            this may not include dlt columns present in the underlying data files.
        disregard_dlt_columns: Whether to ignore apparent mismatches for dlt internal
            columns (_dlt_id, _dlt_load_id). Defaults to True to prevent incorrect
            removal of essential dlt columns from the schema.

    Returns:
        TPartialTableSchema: Returns a partial table schema containing columns that exist in the dlt schema
            but are missing from the actual table.

    Example:
        If dlt schema has [user_id, name, _dlt_id, _dlt_load_id] but destination
        INFORMATION_SCHEMA only shows [user_id, name], this function would return
        an empty dict (assuming disregard_dlt_columns=True) rather than suggesting
        the dlt columns should be dropped.
    """
    col_schemas = schema.get_table_columns(table_name)

    # Transform dlt schema column names to destination format (e.g., 'id' -> 'ID' in Snowflake)
    # to match against actual_col_names from INFORMATION_SCHEMA
    # Keys: destination format, Values: original dlt schema names
    escaped_to_dlt = {escape_col_f(col, False, True): col for col in col_schemas.keys()}

    possibly_dropped_col_names = set(escaped_to_dlt.keys()) - actual_col_names

    if not possibly_dropped_col_names:
        return {}

    partial_table: TPartialTableSchema = {"name": table_name, "columns": {}}

    for esc_name in possibly_dropped_col_names:
        name_in_dlt = escaped_to_dlt[esc_name]

        if disregard_dlt_columns and schema.is_dlt_entity(name_in_dlt):
            continue

        col_schema = col_schemas[name_in_dlt]
        # BUG FIX: the column-schema key is "incremental", not "increment" — the old
        # lookup never matched, so the warning below was never emitted.
        if col_schema.get("incremental"):
            # We can warn within the for loop,
            # since there's only one incremental field per table
            logger.warning(
                f"An incremental field {name_in_dlt} is being removed from schema."
                " You should unset the"
                " incremental with `incremental=dlt.sources.incremental.EMPTY`"
            )
        partial_table["columns"][name_in_dlt] = col_schema

    return partial_table if partial_table["columns"] else {}
374-
375-
376306
def sync_schema_from_storage_schema(
377307
get_storage_tables_f: Callable[[Iterable[str]], Iterable[tuple[str, dict[str, TColumnSchema]]]],
378308
escape_col_f: Callable[[str, bool, bool], str],
@@ -416,13 +346,17 @@ def sync_schema_from_storage_schema(
416346
# we compare actual column schemas with dlt ones ->
417347
# we take the difference as a partial table
418348
else:
419-
partial_table = get_removed_table_columns(
420-
escape_col_f,
421-
schema,
422-
table_name,
423-
set(actual_col_schemas.keys()),
349+
removed_columns = schema.get_removed_table_columns(
350+
table_name=table_name,
351+
existing_columns=actual_col_schemas,
352+
escape_col_f=escape_col_f,
353+
disregard_dlt_columns=True,
424354
)
425-
if partial_table:
355+
if removed_columns:
356+
partial_table: TPartialTableSchema = {
357+
"name": table_name,
358+
"columns": {col["name"]: col for col in removed_columns},
359+
}
426360
column_drops[table_name] = partial_table
427361

428362
# 2. For entire table drops, we make sure no orphaned tables remain

0 commit comments

Comments
 (0)