-
-
Notifications
You must be signed in to change notification settings - Fork 43
feat: Add Ice Chunk support for high-performance cloud data access #292
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 14 commits
09f29f1
7cf48cc
58eb2ac
90bd8aa
5773715
113b9ce
e6cb9d5
dd348c7
e39072b
f7fc65b
6a6d009
2f19b3f
a96b412
8fac972
fe61946
c521625
4fb5b69
ada0181
311e54f
d883142
a550ccb
e6eabe4
bea29ff
7084005
635b476
a41b7f2
4051471
93bf294
199d705
37cfa22
fddd143
fc081b5
8a88ad8
f002487
2248ec7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
devsjc marked this conversation as resolved.
Show resolved
Hide resolved
|
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -1,59 +1,198 @@ | ||||||
| """Satellite loader.""" | ||||||
| import numpy as np | ||||||
|
|
||||||
| import logging | ||||||
| import os | ||||||
| import re | ||||||
| from typing import List, Optional | ||||||
|
|
||||||
| import dask | ||||||
| import icechunk | ||||||
| import xarray as xr | ||||||
| from xarray_tensorstore import open_zarr | ||||||
| from ocf_data_sampler.load.open_tensorstore_zarrs import open_zarrs | ||||||
| from contextlib import contextmanager | ||||||
|
|
||||||
| from ocf_data_sampler.load.utils import ( | ||||||
| check_time_unique_increasing, | ||||||
| get_xr_data_array_from_xr_dataset, | ||||||
| make_spatial_coords_increasing, | ||||||
| ) | ||||||
|
|
||||||
| from .open_tensorstore_zarrs import open_zarrs | ||||||
|
|
||||||
| logger = logging.getLogger(__name__) | ||||||
|
|
||||||
| def open_sat_data(zarr_path: str | list[str]) -> xr.DataArray: | ||||||
| """Lazily opens the zarr store and validates data types. | ||||||
| # Optimal values from research, now hardcoded as per Sol's feedback. | ||||||
devsjc marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||||||
| OPTIMAL_BLOCK_SIZE_MB = 64 | ||||||
| OPTIMAL_THREADS = 2 | ||||||
devsjc marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||||||
|
|
||||||
| Args: | ||||||
| zarr_path: Cloud URL or local path pattern, or list of these. If GCS URL, | ||||||
| it must start with 'gs://' | ||||||
| """ | ||||||
| # Open the data | ||||||
| def open_sat_data(zarr_path: str | list[str], channels: list[str] | None = None) -> xr.DataArray: | ||||||
|
||||||
| """Lazily opens the zarr store and validates data types.""" | ||||||
devsjc marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||||||
|
|
||||||
| if isinstance(zarr_path, list | tuple): | ||||||
| ds = open_zarrs(zarr_path, concat_dim="time") | ||||||
| else: | ||||||
| ds = open_zarr(zarr_path) | ||||||
| # Parse path components using Sol's regex approach | ||||||
devsjc marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||||||
| path_info = _parse_zarr_path(zarr_path) | ||||||
|
|
||||||
| # Sol's requested match/case pattern for path routing | ||||||
devsjc marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||||||
| match path_info: | ||||||
| # Updated case to handle local icechunk paths correctly | ||||||
devsjc marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||||||
| case {"protocol": protocol, "bucket": bucket, "prefix": prefix, "sha1": sha1} if prefix.endswith(".icechunk"): | ||||||
| # Single case for both local and cloud Ice Chunk | ||||||
|
||||||
| # Single case for both local and cloud Ice Chunk |
devsjc marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| # Raise error on unhandled path |
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This can go after [1]
devsjc marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
devsjc marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| "y_geostationary": "f", # floating | |
| "y_geostationary": "f", |
devsjc marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| # Store original values |
devsjc marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| # Sol's recommended regex pattern - handles optional protocol and wildcards |
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| # Validation checks moved from match block |
devsjc marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
devsjc marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| with _setup_optimal_environment(): # Use context manager | |
| with _setup_optimal_environment(): |
devsjc marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| # Get repo from storage (single try/catch) |
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| # CORRECT - uses proper Ice Chunk API |
devsjc marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| # Open the dataset from the Ice Chunk session store |
Uh oh!
There was an error while loading. Please reload this page.