2626"""
2727
2828import concurrent .futures
29+ import gc
2930import logging
3031import os
3132import socket
@@ -185,6 +186,7 @@ def __init__(
         init_sync: bool = True,
         max_retries: Optional[int] = None,
         quorum_retries: int = 0,
+        dataloader_fn: Optional[Callable[[int, int, int], None]] = None,
     ) -> None:
         """
         Args:
@@ -365,6 +367,17 @@ def __init__(

         self._update_fr_path()

+        # The number of batches committed in the current epoch. Unlike _batches_committed,
+        # _current_batches_committed is reset to 0 when the next epoch starts.
+        self._current_batches_committed = 0
+        self._epoch = 0
+        self._loaded_epoch = 0
+        self._loaded_current_batches_committed = 0
+        self._dataloader_fn = dataloader_fn
+        self._dataloader_dirty = False
+        self._dataloader_iter = None
+        self._accumulation_steps = 1
+
     def allow_state_dict_read(self) -> None:
         if self._is_state_dict_read_allowed:
             return
@@ -438,6 +451,12 @@ def allreduce(
             return _DummyWork(tensor)

         self.wait_quorum()
+
+        # If dirty, the result will not be committed, so return a zeroed tensor instead.
+        if self._dataloader_dirty:
+            work = _DummyWork(torch.zeros_like(tensor))
+            return _ManagedWork(self, work, tensor)
+
         num_participants: int = self.num_participants()

         if not self.is_participating():
@@ -678,6 +697,8 @@ def _async_quorum(
             if self._use_async_quorum or not allow_heal
             else (replica_rank, replica_world_size)
         )
+        self._replica_rank = replica_rank
+        self._replica_world_size = replica_world_size

         # For fixed with spares we need to ensure that we don't have more
         # participating replicas than the min replica size.
@@ -691,6 +712,7 @@ def _async_quorum(
         ):
             self._participating_replica_rank = None

+        quorum_changed = False
         if quorum_id != self._quorum_id:
             self.quorum_logger.info(
                 "",
@@ -737,6 +759,7 @@ def _async_quorum(
                 self._logger.exception(f"got exception in pg configure: {e}")
                 self.report_error(e)
                 return
+            quorum_changed = True

         if allow_heal:
             # run recovery on the recovery stream if available
@@ -807,6 +830,38 @@ def _async_quorum(
                 else None
             )

+        # Reconfigure the dataloader after healing so that we can get the offset from the other replica group.
+        if quorum_changed and self._dataloader_fn:
+            self.reconfigure_dataloader()
+            self._dataloader_dirty = True
+
+    def get_batch_samples(self, epoch=0, num_batches=None, batch_size=None, total_batch_size=None):
+        # In general, `start_quorum` might not have been called during the first loop,
+        # and the dataloader might not have been initialized yet. In that case, return
+        # immediately and set the dirty flag to skip computation and the commit.
+        if not self._dataloader_iter:
+            self._dataloader_dirty = True
+            return []
+        # If the recovery worker is behind the current epoch, skip computation and the commit.
+        if epoch < self._loaded_epoch:
+            return None
+
+        if total_batch_size is not None and batch_size is not None:
+            num_batches = total_batch_size // (batch_size * self._replica_world_size)
+
+        assert num_batches is not None, ("num_batches must be specified or "
+                                         "total_batch_size and batch_size must be specified")
+
+        batch_samples = []
+        for _ in range(num_batches):
+            try:
+                batch_samples.append(next(self._dataloader_iter))
+            except StopIteration:
+                break
+        self._dataloader_dirty = False
+        self._accumulation_steps = len(batch_samples)
+        return batch_samples if batch_samples else None
+
     def _update_fr_path(self) -> None:
         """
         Update the path that flight recorder will dump the traces to.
@@ -921,9 +976,14 @@ def should_commit(self, timeout: Optional[timedelta] = None) -> bool:

         # decide whether we're in a healthy state to increase the step count
         if should_commit:
-            self._step += 1
-            self._batches_committed += self.num_participants()
             self._commit_failures = 0  # Reset failure counter on success
+            if not self._dataloader_dirty:
+                self._step += 1
+                self._batches_committed += self.num_participants() * self._accumulation_steps
+                self._current_batches_committed += self.num_participants() * self._accumulation_steps
+                return True
+            else:
+                return False
         else:
             self._commit_failures += 1
             # Check if we've hit max retries
@@ -934,8 +994,7 @@ def should_commit(self, timeout: Optional[timedelta] = None) -> bool:
                 msg = f"should_commit failed {self._commit_failures} times consecutively, exceeding max_retries={self._max_retries}"
                 self._logger.exception(msg)
                 raise RuntimeError(msg)
-
-        return should_commit
+        return False

     def load_state_dict(self, state_dict: Dict[str, int]) -> None:
         """
@@ -948,6 +1007,11 @@ def load_state_dict(self, state_dict: Dict[str, int]) -> None:
9481007 """
9491008 self ._step = state_dict ["step" ]
9501009 self ._batches_committed = state_dict ["batches_committed" ]
1010+ self ._loaded_epoch = state_dict ["epoch" ]
1011+ self ._loaded_current_batches_committed = state_dict ["current_batches_committed" ]
1012+ if self ._loaded_epoch == 0 :
1013+ self ._epoch = 0
1014+ self ._current_batches_committed = self ._loaded_current_batches_committed
9511015
9521016 def _manager_state_dict (self ) -> Dict [str , object ]:
9531017 with self ._state_dict_lock .r_lock ():
@@ -969,7 +1033,8 @@ def state_dict(self) -> Dict[str, int]:
         Returns:
             the state dict for this manager
         """
-        return {"step": self._step, "batches_committed": self._batches_committed}
+        return {"step": self._step, "batches_committed": self._batches_committed,
+                "epoch": self._epoch, "current_batches_committed": self._current_batches_committed}

     def current_step(self) -> int:
         """
@@ -1047,6 +1112,23 @@ def is_participating(self) -> bool:
             return False
         return True

+    def reconfigure_dataloader(self):
+        dataloader = self._dataloader_fn(self._replica_world_size,
+                                         self._replica_rank, self._current_batches_committed)
+        dataloader.sampler.set_epoch(self._epoch)
+        self._dataloader_iter = iter(dataloader)
+        # Clean up the old dataloader.
+        gc.collect()
+
+    def next_epoch(self):
+        self._epoch += 1
+        if self._loaded_epoch == self._epoch:
+            self._current_batches_committed = self._loaded_current_batches_committed
+        else:
+            self._current_batches_committed = 0
+        if self._dataloader_fn:
+            self.reconfigure_dataloader()
+        self._dataloader_dirty = False

 class _ManagerLogger:
     def __init__(self, manager: Manager, replica_id: str, group_rank: int) -> None:
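For context, a minimal sketch of how the new hooks might be driven from a training loop. This is not part of the patch: `train_dataset`, `local_batch_size`, `global_batch_size`, `num_epochs`, `model`, `optimizer`, and `loss_fn` are placeholders, and the `Manager` constructor arguments are elided; only `dataloader_fn`, `get_batch_samples`, `should_commit`, and `next_epoch` correspond to the changes above, and the exact call sequence is an assumption.

```python
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler

from torchft import Manager


def make_dataloader(replica_world_size: int, replica_rank: int, batches_committed: int) -> DataLoader:
    # Shard the dataset across replica groups. `batches_committed` is how many batches this
    # shard has already consumed in the current epoch, so a resumable sampler could skip
    # ahead by that amount (omitted here for brevity).
    sampler = DistributedSampler(train_dataset, num_replicas=replica_world_size, rank=replica_rank)
    return DataLoader(train_dataset, batch_size=local_batch_size, sampler=sampler)


manager = Manager(
    # ... the usual Manager arguments (process group, state_dict hooks, ...) ...
    dataloader_fn=make_dataloader,
)

for epoch in range(num_epochs):
    while True:
        manager.start_quorum()
        batch_samples = manager.get_batch_samples(
            epoch=epoch, batch_size=local_batch_size, total_batch_size=global_batch_size
        )
        if batch_samples is None:
            break  # epoch exhausted, or this replica is still behind the loaded epoch
        optimizer.zero_grad()
        for batch in batch_samples:  # gradient accumulation over the returned micro-batches
            loss = loss_fn(model, batch)
            loss.backward()
        if manager.should_commit():  # returns False while the dataloader is dirty
            optimizer.step()
    manager.next_epoch()
```

The property this exercises: after a quorum change, `get_batch_samples` and `should_commit` keep a replica from committing work computed against a stale data order until `reconfigure_dataloader` has re-sharded the data.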