Facets are now read from the esmvalcore intake configuration file instead of being passed to the to_iris call by the user.

charles-turner-1 · charles-turner-1 · commit 5961ec0bfef0 · 2025-05-09T15:34:18.000+08:00
diff --git a/intake_esm/_search.py b/intake_esm/_search.py
@@ -45,7 +45,10 @@ def search(
         column_has_iterables = column in columns_with_iterables
         for value in values:
             if column_has_iterables:
-                mask = df[column].str.contains(value, regex=False)
+                try:
+                    mask = df[column].str.contains(value, regex=False)
+                except AttributeError:
+                    mask = df[column].apply(tuple).str.contains(value, regex=False)
             elif column_is_stringtype and is_pattern(value):
                 mask = df[column].str.contains(value, regex=True, case=True, flags=0)
             elif pd.isna(value):
diff --git a/intake_esm/cat.py b/intake_esm/cat.py
@@ -109,6 +109,7 @@ class ESMCatalogModel(pydantic.BaseModel):
     id: str = ''
     catalog_dict: list[dict] | None = None
     catalog_file: pydantic.StrictStr | None = None
+    fhandle: pydantic.StrictStr | None = None
     description: pydantic.StrictStr | None = None
     title: pydantic.StrictStr | None = None
     last_updated: datetime.datetime | datetime.date | None = None
@@ -262,6 +263,7 @@ def load(
                     df=pl.DataFrame(cat.catalog_dict).to_pandas(),
                 )
 
+            cat.fhandle = json_file
             cat._cast_agg_columns_with_iterables()
             return cat
 
diff --git a/intake_esm/core.py b/intake_esm/core.py
@@ -8,7 +8,6 @@
 if typing.TYPE_CHECKING:
     import esmvalcore
     import esmvalcore.dataset
-    from esmvalcore.typing import FacetValue
 
 import dask
 import packaging.version
@@ -847,7 +846,7 @@ def to_dask(self, **kwargs) -> xr.Dataset:
 
     def to_iris(
         self,
-        facet_map: dict['FacetValue', str],
+        search: dict[str, str],
         cmorizer: typing.Any | None = None,
         **kwargs,
     ) -> 'esmvalcore.dataset.Dataset':
@@ -889,10 +888,22 @@ def to_iris(
             raise ValueError(
                 f'Expected exactly one dataset. Received {len(self)} datasets. Please refine your search.'
             )
-        else:
-            from esmvalcore.dataset import Dataset
 
-        ds = Dataset(**facet_map)
+        # Use esmvalcore to load the intake configuration & work out how we
+        # need to map our facets
+
+        from esmvalcore.config._intake import load_intake_config
+        from esmvalcore.dataset import Dataset
+
+        facet_map, project = _read_facets(load_intake_config(), self.esmcat.fhandle)
+
+        facets = {k: search.get(v) for k, v in facet_map.items()}
+        facets = {k: v for k, v in facets.items() if v is not None}
+
+        facets.pop('version', None)  # If there's a version, chuck it
+        facets['project'] = project
+
+        ds = Dataset(**facets)
 
         ds.files = self.unique().path
         ds.augment_facets()
@@ -926,3 +937,58 @@ def _get_threaded(threaded: bool | None) -> bool:
             ) from e
 
     return threaded
+
+
+def _read_facets(
+    cfg: dict,
+    fhandle: str | None,
+    project: str | None = None,
+) -> tuple[dict[str, typing.Any], str]:
+    """
+    Extract facet mapping from ESMValCore configuration for a given catalog file handle.
+
+    Recursively traverses the ESMValCore configuration structure to find the
+    facet mapping that corresponds to the specified file handle. Every branch
+    is searched, so the catalog need not sit under the first project listed.
+
+    Parameters
+    ----------
+    cfg : dict
+        The ESMValCore intake configuration dictionary.
+    fhandle : str or None
+        The file handle/path of the intake-esm catalog to match.
+    project : str, optional
+        The current project name in the configuration hierarchy.
+
+    Returns
+    -------
+    tuple
+        A tuple containing:
+        - dict: Facet mapping between ESMValCore facets and catalog columns
+        - str: The project name associated with the catalog file
+
+    Raises
+    ------
+    ValueError
+        If ``fhandle`` is None, or no entry in ``cfg`` lists it as its file.
+    """
+    if fhandle is None:
+        raise ValueError('Unable to ascertain facets without valid file handle.')
+
+    for _project, val in cfg.items():
+        if isinstance(val, list):
+            for facet_info in val:
+                if isinstance(facet_info, dict) and facet_info.get('file') == fhandle:
+                    return facet_info.get('facets'), project  # type: ignore[return-value]
+        elif isinstance(val, dict):
+            try:
+                return _read_facets(val, fhandle, project or _project)
+            except ValueError:
+                # Not under this key; keep searching the sibling branches
+                # instead of giving up after the first one.
+                continue
+
+    raise ValueError(
+        f'No facets found for {fhandle} in the config file. '
+        'Please check the config file and ensure it is valid.'
+    )
diff --git a/tests/sample-catalogs/access-columns-with-iterables.csv.gz b/tests/sample-catalogs/access-columns-with-iterables.csv.gz
diff --git a/tests/sample-catalogs/access-columns-with-iterables.json b/tests/sample-catalogs/access-columns-with-iterables.json
@@ -0,0 +1,81 @@
+{
+  "esmcat_version": "0.0.1",
+  "attributes": [
+    {
+      "column_name": "filename",
+      "vocabulary": ""
+    },
+    {
+      "column_name": "file_id",
+      "vocabulary": ""
+    },
+    {
+      "column_name": "path",
+      "vocabulary": ""
+    },
+    {
+      "column_name": "filename_timestamp",
+      "vocabulary": ""
+    },
+    {
+      "column_name": "frequency",
+      "vocabulary": ""
+    },
+    {
+      "column_name": "start_date",
+      "vocabulary": ""
+    },
+    {
+      "column_name": "end_date",
+      "vocabulary": ""
+    },
+    {
+      "column_name": "variable",
+      "vocabulary": ""
+    },
+    {
+      "column_name": "variable_long_name",
+      "vocabulary": ""
+    },
+    {
+      "column_name": "variable_standard_name",
+      "vocabulary": ""
+    },
+    {
+      "column_name": "variable_cell_methods",
+      "vocabulary": ""
+    },
+    {
+      "column_name": "variable_units",
+      "vocabulary": ""
+    },
+    {
+      "column_name": "realm",
+      "vocabulary": ""
+    }
+  ],
+  "assets": {
+    "column_name": "path",
+    "format": "netcdf",
+    "format_column_name": null
+  },
+  "aggregation_control": {
+    "variable_column_name": "variable",
+    "groupby_attrs": ["file_id", "frequency"],
+    "aggregations": [
+      {
+        "type": "join_existing",
+        "attribute_name": "start_date",
+        "options": {
+          "dim": "time",
+          "combine": "by_coords"
+        }
+      }
+    ]
+  },
+  "id": "01deg_jra55v13_ryf9091",
+  "description": "0.1 degree ACCESS-OM2 global model configuration with JRA55-do v1.3 RYF9091  repeat year forcing (May 1990 to Apr 1991)",
+  "title": null,
+  "last_updated": "2025-03-04T01:25:35Z",
+  "catalog_file": "access-columns-with-iterables.csv.gz"
+}
diff --git a/tests/test_core.py b/tests/test_core.py
@@ -19,6 +19,7 @@
 import intake_esm
 
 from .utils import (
+    access_columns_with_iterables_cat,
     catalog_dict_records,
     cdf_cat_sample_cesmle,
     cdf_cat_sample_cmip5,
@@ -201,6 +202,18 @@ def test_catalog_search(path, query, expected_size):
     assert len(new_cat) == expected_size
 
 
+@pytest.mark.parametrize(
+    'path, columns_with_iterables, query, expected_size',
+    [
+        (access_columns_with_iterables_cat, ['variable'], {'variable': ['aice_m']}, 1),
+    ],
+)
+def test_catalog_search_columns_with_iterables(path, columns_with_iterables, query, expected_size):
+    cat = intake.open_esm_datastore(path, columns_with_iterables=columns_with_iterables)
+    new_cat = cat.search(**query)
+    assert len(new_cat) == expected_size
+
+
 def test_catalog_with_registry_search():
     cat = intake.open_esm_datastore(zarr_cat_aws_cesm, registry=registry)
     new_cat = cat.search(variable='FOO')
@@ -648,7 +661,14 @@ def test_to_iris_unavailable():
     )
     with pytest.raises(ImportError, match=r'`to_iris\(\)` requires the esmvalcore package'):
         _ = cat_sub.to_iris(
-            facet_map={},
+            search=dict(
+                variable_id=['pr'],
+                experiment_id='ssp370',
+                activity_id='AerChemMIP',
+                source_id='BCC-ESM1',
+                table_id='Amon',
+                grid_label='gn',
+            ),
             xarray_open_kwargs={
                 'consolidated': True,
                 'backend_kwargs': {'storage_options': {'token': 'anon'}},
diff --git a/tests/utils.py b/tests/utils.py
@@ -16,6 +16,9 @@
     'https://raw.githubusercontent.com/NCAR/cesm-lens-aws/master/intake-catalogs/aws-cesm1-le.json'
 )
 mixed_cat_sample_cmip6 = os.path.join(here, 'sample-catalogs/cmip6-bcc-mixed-formats.json')
+access_columns_with_iterables_cat = os.path.join(
+    here, 'sample-catalogs/access-columns-with-iterables.json'
+)
 
 
 sample_df = pd.DataFrame(

Original file line number	Diff line number	Diff line change
`@@ -16,6 +16,9 @@`
`16`	`16`	`'https://raw.githubusercontent.com/NCAR/cesm-lens-aws/master/intake-catalogs/aws-cesm1-le.json'`
`17`	`17`	`)`
`18`	`18`	`mixed_cat_sample_cmip6 = os.path.join(here, 'sample-catalogs/cmip6-bcc-mixed-formats.json')`
	`19`	`+access_columns_with_iterables_cat = os.path.join(`
	`20`	`+ here, 'sample-catalogs/access-columns-with-iterables.json'`
	`21`	`+)`
`19`	`22`
`20`	`23`
`21`	`24`	`sample_df = pd.DataFrame(`