Skip to content

Commit 678cd47

Browse files
Set correct dtype when loading and saving hdf5 (#2014)
* fixed load_hdf5 * fixed save_hdf5 * fixed different behavior in tests * test torch dtype for save_hdf5 --------- Co-authored-by: Claudia Comito <[email protected]>
1 parent 4cf0146 commit 678cd47

File tree

2 files changed

+29
-14
lines changed

2 files changed

+29
-14
lines changed

heat/core/io.py

Lines changed: 23 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -518,7 +518,7 @@ def supports_hdf5() -> bool:
518518
def load_hdf5(
519519
path: str,
520520
dataset: str,
521-
dtype: datatype = types.float32,
521+
dtype: Optional[datatype] = None,
522522
slices: Optional[Tuple[Optional[slice], ...]] = None,
523523
split: Optional[int] = None,
524524
device: Optional[str] = None,
@@ -534,7 +534,7 @@ def load_hdf5(
534534
dataset : str
535535
Name of the dataset to be read.
536536
dtype : datatype, optional
537-
Data type of the resulting array.
537+
Data type of the resulting array, defaults to the loaded dataset's type.
538538
slices : tuple of slice objects, optional
539539
Load only the specified slices of the dataset.
540540
split : int or None, optional
@@ -626,8 +626,6 @@ def load_hdf5(
626626
elif split is not None and not isinstance(split, int):
627627
raise TypeError(f"split must be None or int, not {type(split)}")
628628

629-
# infer the type and communicator for the loaded array
630-
dtype = types.canonical_heat_type(dtype)
631629
# determine the comm and device the data will be placed on
632630
device = devices.sanitize_device(device)
633631
comm = sanitize_comm(comm)
@@ -638,6 +636,9 @@ def load_hdf5(
638636
gshape = data.shape
639637
new_gshape = tuple()
640638
offsets = [0] * len(gshape)
639+
if dtype is None:
640+
dtype = data.dtype
641+
dtype = types.canonical_heat_type(dtype)
641642
if slices is not None:
642643
for i in range(len(gshape)):
643644
if i < len(slices) and slices[i]:
@@ -688,7 +689,12 @@ def load_hdf5(
688689
return DNDarray(data, gshape, dtype, split, device, comm, balanced)
689690

690691
def save_hdf5(
691-
data: DNDarray, path: str, dataset: str, mode: str = "w", **kwargs: Dict[str, object]
692+
data: DNDarray,
693+
path: str,
694+
dataset: str,
695+
mode: str = "w",
696+
dtype: Optional[datatype] = None,
697+
**kwargs: Dict[str, object],
692698
):
693699
"""
694700
Saves ``data`` to an HDF5 file. Attempts to utilize parallel I/O if possible.
@@ -703,6 +709,8 @@ def save_hdf5(
703709
Name of the dataset the data is saved to.
704710
mode : str, optional
705711
File access mode, one of ``'w', 'a', 'r+'``
712+
dtype : datatype, optional
713+
Data type of the saved data
706714
kwargs : dict, optional
707715
Additional arguments passed to the created dataset.
708716
@@ -733,16 +741,23 @@ def save_hdf5(
733741
is_split = data.split is not None
734742
_, _, slices = data.comm.chunk(data.gshape, data.split if is_split else 0)
735743

744+
if dtype is None:
745+
dtype = data.dtype
746+
elif type(dtype) == torch.dtype:
747+
dtype = str(dtype).split(".")[-1]
748+
if type(dtype) is not str:
749+
dtype = dtype.__name__
750+
736751
# attempt to perform parallel I/O if possible
737752
if h5py.get_config().mpi:
738753
with h5py.File(path, mode, driver="mpio", comm=data.comm.handle) as handle:
739-
dset = handle.create_dataset(dataset, data.shape, **kwargs)
754+
dset = handle.create_dataset(dataset, data.shape, dtype=dtype, **kwargs)
740755
dset[slices] = data.larray.cpu() if is_split else data.larray[slices].cpu()
741756

742757
# otherwise a single rank only write is performed in case of local data (i.e. no split)
743758
elif data.comm.rank == 0:
744759
with h5py.File(path, mode) as handle:
745-
dset = handle.create_dataset(dataset, data.shape, **kwargs)
760+
dset = handle.create_dataset(dataset, data.shape, dtype=dtype, **kwargs)
746761
if is_split:
747762
dset[slices] = data.larray.cpu()
748763
else:
@@ -764,7 +779,7 @@ def save_hdf5(
764779
next_rank = (data.comm.rank + 1) % data.comm.size
765780
data.comm.Isend([None, 0, MPI.INT], dest=next_rank)
766781

767-
DNDarray.save_hdf5 = lambda self, path, dataset, mode="w", **kwargs: save_hdf5(
782+
DNDarray.save_hdf5 = lambda self, path, dataset, mode="w", dtype=None, **kwargs: save_hdf5(
768783
self, path, dataset, mode, **kwargs
769784
)
770785
DNDarray.save_hdf5.__doc__ = save_hdf5.__doc__

heat/core/tests/test_io.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ def test_size_from_slice(self):
117117
def test_load(self):
118118
# HDF5
119119
if ht.io.supports_hdf5():
120-
iris = ht.load(self.HDF5_PATH, dataset="data")
120+
iris = ht.load(self.HDF5_PATH, dataset="data", dtype=ht.float32)
121121
self.assertIsInstance(iris, ht.DNDarray)
122122
# shape invariant
123123
self.assertEqual(iris.shape, self.IRIS.shape)
@@ -602,7 +602,7 @@ def test_load_hdf5(self):
602602
self.skipTest("Requires HDF5")
603603

604604
# default parameters
605-
iris = ht.load_hdf5(self.HDF5_PATH, self.HDF5_DATASET)
605+
iris = ht.load_hdf5(self.HDF5_PATH, self.HDF5_DATASET, dtype=ht.float32)
606606
self.assertIsInstance(iris, ht.DNDarray)
607607
self.assertEqual(iris.shape, self.IRIS.shape)
608608
self.assertEqual(iris.dtype, ht.float32)
@@ -613,13 +613,13 @@ def test_load_hdf5(self):
613613
iris = ht.load_hdf5(self.HDF5_PATH, self.HDF5_DATASET, split=0)
614614
self.assertIsInstance(iris, ht.DNDarray)
615615
self.assertEqual(iris.shape, self.IRIS.shape)
616-
self.assertEqual(iris.dtype, ht.float32)
616+
self.assertEqual(iris.dtype, ht.float64)
617617
lshape = iris.lshape
618618
self.assertLessEqual(lshape[0], self.IRIS.shape[0])
619619
self.assertEqual(lshape[1], self.IRIS.shape[1])
620620

621621
# negative split axis
622-
iris = ht.load_hdf5(self.HDF5_PATH, self.HDF5_DATASET, split=-1)
622+
iris = ht.load_hdf5(self.HDF5_PATH, self.HDF5_DATASET, split=-1, dtype=ht.float32)
623623
self.assertIsInstance(iris, ht.DNDarray)
624624
self.assertEqual(iris.shape, self.IRIS.shape)
625625
self.assertEqual(iris.dtype, ht.float32)
@@ -661,7 +661,7 @@ def test_save_hdf5(self):
661661
# local unsplit data
662662
local_data = ht.arange(100)
663663
ht.save_hdf5(
664-
local_data, self.HDF5_OUT_PATH, self.HDF5_DATASET, dtype=local_data.dtype.char()
664+
local_data, self.HDF5_OUT_PATH, self.HDF5_DATASET, dtype=torch.int32
665665
)
666666
if local_data.comm.rank == 0:
667667
with ht.io.h5py.File(self.HDF5_OUT_PATH, "r") as handle:
@@ -673,7 +673,7 @@ def test_save_hdf5(self):
673673
# distributed data range
674674
split_data = ht.arange(100, split=0)
675675
ht.save_hdf5(
676-
split_data, self.HDF5_OUT_PATH, self.HDF5_DATASET, dtype=split_data.dtype.char()
676+
split_data, self.HDF5_OUT_PATH, self.HDF5_DATASET
677677
)
678678
if split_data.comm.rank == 0:
679679
with ht.io.h5py.File(self.HDF5_OUT_PATH, "r") as handle:

0 commit comments

Comments
 (0)