fixing Athena column name sanitization to actually work in all cases

brifordwylie · brifordwylie · commit 0720ae79e80e · 2025-09-09T16:09:10.000-06:00
diff --git a/examples/models/smiles_to_md_v1.py b/examples/models/smiles_to_md_v1.py
@@ -4,8 +4,8 @@
 from workbench.api import FeatureSet, ModelType, Model
 from workbench.utils.model_utils import get_custom_script_path
 
-fs_name = "aqsol_features"
-# fs_name = "solubility_featurized_class_0_fs"
+# fs_name = "aqsol_features"
+fs_name = "solubility_featurized_class_0_fs"
 
 
 script_path = get_custom_script_path("chem_info", "molecular_descriptors.py")
diff --git a/src/workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py b/src/workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py
@@ -75,7 +75,7 @@ def output_fn(output_df, accept_type):
 # Prediction function
 def predict_fn(df, model):
 
-    # Standardize the molecule (remove salts) and then compute descriptors
-    df = standardize(df)
+    # Standardize the molecule (extract salts) and then compute descriptors
+    df = standardize(df, extract_salts=True)
     df = compute_descriptors(df)
     return df
diff --git a/src/workbench/utils/chem_utils/mol_descriptors.py b/src/workbench/utils/chem_utils/mol_descriptors.py
@@ -344,21 +344,16 @@ def compute_descriptors(df: pd.DataFrame, include_mordred: bool = True, include_
     stereo_count = len(stereo_df.columns) if include_stereo else 0
     logger.info(f"Descriptor breakdown: RDKit={rdkit_count}, Mordred={mordred_count}, Stereo={stereo_count}")
 
-    # Note: The results are often stored in an AWS Athena table.
-    # Athena has restrictions on column names:
-    # - Must be lowercase
-    # - No special characters except underscore
-    # - No spaces
-    # https://docs.aws.amazon.com/athena/latest/ug/tables-databases-columns-names.html
-    safe_columns = [re.sub(r"_+", "_", re.sub(r"[^a-z0-9_]", "_", col.lower())) for col in result.columns]
-
-    # Check for duplicates before dropping
-    if len(safe_columns) != len(set(safe_columns)):
-        from collections import Counter
-
-        duplicates = {col for col, count in Counter(safe_columns).items() if count > 1}
-        logger.warning(f"Duplicate column names after sanitization: {duplicates} - dropping duplicates!")
-        result.columns = safe_columns
+    # Sanitize column names for AWS Athena compatibility
+    # - Must be lowercase, no special characters except underscore, no spaces
+    result.columns = [
+        re.sub(r"_+", "_", re.sub(r"[^a-z0-9_]", "_", col.lower()))
+        for col in result.columns
+    ]
+
+    # Drop duplicate columns if any exist after sanitization
+    if result.columns.duplicated().any():
+        logger.warning(f"Duplicate column names after sanitization - dropping duplicates!")
         result = result.loc[:, ~result.columns.duplicated()]
 
     return result
diff --git a/src/workbench/utils/pandas_utils.py b/src/workbench/utils/pandas_utils.py
@@ -152,17 +152,19 @@ def compare_dataframes(df1: pd.DataFrame, df2: pd.DataFrame, display_columns: li
 
     # Check for differences in common columns
     for column in common_columns:
-        if pd.api.types.is_string_dtype(df1[column]) or pd.api.types.is_string_dtype(df2[column]):
+        if pd.api.types.is_string_dtype(df1[column]) and pd.api.types.is_string_dtype(df2[column]):
             # String comparison with NaNs treated as equal
             differences = ~(df1[column].fillna("") == df2[column].fillna(""))
         elif pd.api.types.is_float_dtype(df1[column]) or pd.api.types.is_float_dtype(df2[column]):
             # Float comparison within epsilon with NaNs treated as equal
             differences = ~((df1[column] - df2[column]).abs() <= epsilon) & ~(
-                pd.isna(df1[column]) & pd.isna(df2[column])
+                    pd.isna(df1[column]) & pd.isna(df2[column])
             )
         else:
-            # Other types (e.g., int) with NaNs treated as equal
-            differences = ~(df1[column].fillna(0) == df2[column].fillna(0))
+            # Other types (int, Int64, etc.) - compare with NaNs treated as equal
+            differences = (df1[column] != df2[column]) & ~(
+                    pd.isna(df1[column]) & pd.isna(df2[column])
+            )
 
         # If differences exist, display them
         if differences.any():