Skip to content

Commit 8f5dc98

Browse files
committed
feat: Integrate Ice Chunk for optimized satellite loading
This commit introduces Ice Chunk support for significantly faster satellite data loading, achieving up to 1.9x performance improvement over plain Zarr. Key changes: - Unified satellite data loading using a single function that intelligently dispatches to either standard Zarr or Ice Chunk based on the path. - Simplified configuration by removing Ice Chunk-specific parameters and using a single field. - Optimized Ice Chunk conversion process using the library. - Added comprehensive benchmarking scripts to compare Ice Chunk and plain Zarr performance. The benchmark results show a significant performance boost with Ice Chunk. This enhancement reduces data loading times and improves overall efficiency, bringing the OCF Data Sampler closer to production readiness. The main files that were changed are: - ocf_data_sampler/config/model.py - ocf_data_sampler/load/satellite.py - ocf_data_sampler/load/load_dataset.py - scripts/full_dataset_icechunk_conversion.py - scripts/production_benchmark_comparison.py
1 parent ae6399f commit 8f5dc98

12 files changed

+353
-805
lines changed

ocf_data_sampler/config/model.py

Lines changed: 2 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -216,92 +216,17 @@ def channel_stds(self) -> dict[str, float]:
216216
class Satellite(TimeWindowMixin, DropoutMixin, SpatialWindowMixin, NormalisationConstantsMixin):
217217
"""Satellite configuration model with Ice Chunk support."""
218218

219-
220-
zarr_path: str | tuple[str] | list[str] | None = Field(
221-
None,
219+
zarr_path: str | tuple[str] | list[str] = Field(
220+
...,
222221
description="Absolute or relative zarr filepath(s). Prefix with a protocol like s3:// "
223222
"to read from alternative filesystems.",
224223
)
225224

226-
227-
# NEW: Ice Chunk support
228-
icechunk_path: str | None = Field(
229-
None,
230-
description="Remote Ice Chunk dataset path for cloud-native streaming. "
231-
"When provided, this will be used instead of zarr_path with 103+ MB/s optimizations.",
232-
)
233-
234-
235-
bucket_name: str = Field(
236-
"gsoc-dakshbir",
237-
description="GCS bucket name for Ice Chunk datasets",
238-
)
239-
240-
241-
# NEW: True Ice Chunk settings
242-
use_true_icechunk: bool = Field(
243-
False,
244-
description="Whether to use the true Ice Chunk API for versioning and transactions.",
245-
)
246-
247-
248-
icechunk_branch: str | None = Field(
249-
"main",
250-
description="The Ice Chunk branch to use (e.g., 'main').",
251-
)
252-
253-
254-
icechunk_commit: str | None = Field(
255-
None,
256-
description="The specific Ice Chunk commit ID to use. If provided, it overrides the branch.",
257-
)
258-
259-
260-
# NEW: Optimization parameters (from your research)
261-
use_optimized_streaming: bool = Field(
262-
True,
263-
description="Enable 103+ MB/s streaming optimizations for Ice Chunk",
264-
)
265-
266-
267-
optimal_time_steps: int = Field(
268-
6,
269-
description="Optimal time steps per chunk for streaming. "
270-
"Default is 6, which is optimal for 11-channel satellite data.",
271-
)
272-
273-
274-
optimal_block_size_mb: int = Field(
275-
64,
276-
description="Optimal GCS block size in MB (default: 64 from optimization research)",
277-
)
278-
279-
280225
channels: list[str] = Field(
281226
...,
282227
description="the satellite channels that are used",
283228
)
284229

285-
286-
@model_validator(mode="after")
287-
def validate_data_source(self) -> "Satellite":
288-
"""Validate that either zarr_path or icechunk_path is provided."""
289-
if not self.zarr_path and not self.icechunk_path:
290-
raise ValueError(
291-
"Either 'zarr_path' or 'icechunk_path' must be provided for satellite data"
292-
)
293-
294-
if self.zarr_path and self.icechunk_path:
295-
# If both provided, prefer icechunk_path and warn
296-
import warnings
297-
warnings.warn(
298-
"Both zarr_path and icechunk_path provided. Using icechunk_path for optimized streaming.",
299-
UserWarning
300-
)
301-
302-
return self
303-
304-
305230
@model_validator(mode="after")
306231
def check_all_channel_have_normalisation_constants(self) -> "Satellite":
307232
"""Check that all the channels have normalisation constants."""
@@ -315,8 +240,6 @@ def check_all_channel_have_normalisation_constants(self) -> "Satellite":
315240
return self
316241

317242

318-
319-
320243
class NWP(TimeWindowMixin, DropoutMixin, SpatialWindowMixin, NormalisationConstantsMixin):
321244
"""NWP configuration model."""
322245

@@ -484,5 +407,3 @@ class Configuration(Base):
484407

485408
general: General = General()
486409
input_data: InputData = InputData()
487-
488-

ocf_data_sampler/load/icechunk_optimized.py

Lines changed: 0 additions & 229 deletions
This file was deleted.

0 commit comments

Comments
 (0)