From 585502d4955c02c3928e3b4735e80aced0705e10 Mon Sep 17 00:00:00 2001 From: skishore Date: Thu, 26 Jun 2025 10:17:28 +0000 Subject: [PATCH 01/79] Created initial code for loading fused_dense module dynamically instead of building it. Code uses accelerator and op_builder modules from deepspeed code. --- accelerator/__init__.py | 7 + accelerator/abstract_accelerator.py | 306 ++++++++++ accelerator/cpu_accelerator.py | 361 ++++++++++++ accelerator/cuda_accelerator.py | 385 ++++++++++++ accelerator/real_accelerator.py | 171 ++++++ apex/accelerator | 1 + apex/csrc | 1 + apex/fused_dense/fused_dense.py | 7 +- apex/git_version_info.py | 31 + apex/op_builder | 1 + op_builder/__init__.py | 53 ++ op_builder/all_ops.py | 28 + op_builder/builder.py | 862 +++++++++++++++++++++++++++ op_builder/fused_dense.py | 40 ++ setup.py | 868 +++++----------------------- 15 files changed, 2397 insertions(+), 725 deletions(-) create mode 100644 accelerator/__init__.py create mode 100644 accelerator/abstract_accelerator.py create mode 100644 accelerator/cpu_accelerator.py create mode 100644 accelerator/cuda_accelerator.py create mode 100644 accelerator/real_accelerator.py create mode 120000 apex/accelerator create mode 120000 apex/csrc create mode 100644 apex/git_version_info.py create mode 120000 apex/op_builder create mode 100644 op_builder/__init__.py create mode 100644 op_builder/all_ops.py create mode 100644 op_builder/builder.py create mode 100644 op_builder/fused_dense.py diff --git a/accelerator/__init__.py b/accelerator/__init__.py new file mode 100644 index 000000000..e145afb03 --- /dev/null +++ b/accelerator/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from .abstract_accelerator import ApexAccelerator +from .real_accelerator import get_accelerator, set_accelerator, is_current_accelerator_supported \ No newline at end of file diff --git a/accelerator/abstract_accelerator.py b/accelerator/abstract_accelerator.py new file mode 100644 index 000000000..e8064de4a --- /dev/null +++ b/accelerator/abstract_accelerator.py @@ -0,0 +1,306 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import abc +from abc import ABC + + +class ApexAccelerator(ABC): + + def __init__(self): + self._name = None + self._communication_backend_name = None + self._compile_backend = None + + @abc.abstractmethod + def is_synchronized_device(self): + ... + + @abc.abstractmethod + def use_host_timers(self): + ... + + @abc.abstractmethod + def resolves_data_dependency(self): + ... + + @abc.abstractmethod + def handles_memory_backpressure(self): + ... + + # Device APIs + @abc.abstractmethod + def device_name(self, device_index): + ... + + @abc.abstractmethod + def device(self, device_index): + ... + + @abc.abstractmethod + def set_device(self, device_index): + ... + + @abc.abstractmethod + def current_device(self): + ... + + @abc.abstractmethod + def current_device_name(self): + ... + + @abc.abstractmethod + def device_count(self): + ... + + @abc.abstractmethod + def synchronize(self, device_index=None): + ... + + # RNG APIs + @abc.abstractmethod + def random(self): + ... + + @abc.abstractmethod + def set_rng_state(self, new_state, device_index=None): + ... + + @abc.abstractmethod + def get_rng_state(self, device_index=None): + ... + + @abc.abstractmethod + def manual_seed(self, seed): + ... + + @abc.abstractmethod + def manual_seed_all(self, seed): + ... 
+ + @abc.abstractmethod + def initial_seed(self): + ... + + @abc.abstractmethod + def default_generator(self, device_index): + ... + + # Streams/Events + @property + @abc.abstractmethod + def Stream(self): + ... + + @abc.abstractmethod + def stream(self, stream): + ... + + @abc.abstractmethod + def current_stream(self, device_index=None): + ... + + @abc.abstractmethod + def default_stream(self, device_index=None): + ... + + @property + @abc.abstractmethod + def Event(self): + ... + + # Memory management + @abc.abstractmethod + def empty_cache(self): + ... + + @abc.abstractmethod + def memory_allocated(self, device_index=None): + ... + + @abc.abstractmethod + def max_memory_allocated(self, device_index=None): + ... + + @abc.abstractmethod + def reset_max_memory_allocated(self, device_index=None): + ... + + @abc.abstractmethod + def memory_cached(self, device_index=None): + ... + + @abc.abstractmethod + def max_memory_cached(self, device_index=None): + ... + + @abc.abstractmethod + def reset_max_memory_cached(self, device_index=None): + ... + + @abc.abstractmethod + def memory_stats(self, device_index=None): + ... + + @abc.abstractmethod + def reset_peak_memory_stats(self, device_index=None): + ... + + @abc.abstractmethod + def memory_reserved(self, device_index=None): + ... + + @abc.abstractmethod + def max_memory_reserved(self, device_index=None): + ... + + @abc.abstractmethod + def total_memory(self, device_index=None): + ... + + @abc.abstractmethod + def available_memory(self, device_index=None): + ... + + # Data types + @abc.abstractmethod + def is_bf16_supported(self): + ... + + @abc.abstractmethod + def is_fp16_supported(self): + ... + + @abc.abstractmethod + def supported_dtypes(self): + ... + + # Misc + @abc.abstractmethod + def amp(self): + ... + + @abc.abstractmethod + def is_available(self): + ... + + @abc.abstractmethod + def range_push(self, msg): + ... + + @abc.abstractmethod + def range_pop(self): + ... + + @abc.abstractmethod + def lazy_call(self, callback): + ... + + @abc.abstractmethod + def communication_backend_name(self): + ... + + @abc.abstractmethod + def is_triton_supported(self): + ... + + # Graph operations + @abc.abstractmethod + def create_graph(self): + ... + + @abc.abstractmethod + def capture_to_graph(self, graph, pool=None, stream=None): + ... + + @abc.abstractmethod + def replay_graph(self, graph): + ... + + # Tensor operations + @property + @abc.abstractmethod + def BFloat16Tensor(self): + ... + + @property + @abc.abstractmethod + def ByteTensor(self): + ... + + @property + @abc.abstractmethod + def DoubleTensor(self): + ... + + @property + @abc.abstractmethod + def FloatTensor(self): + ... + + @property + @abc.abstractmethod + def HalfTensor(self): + ... + + @property + @abc.abstractmethod + def IntTensor(self): + ... + + @property + @abc.abstractmethod + def LongTensor(self): + ... + + @abc.abstractmethod + def pin_memory(self, tensor, align_bytes=1): + ... + + @abc.abstractmethod + def is_pinned(self, tensor): + ... + + @abc.abstractmethod + def on_accelerator(self, tensor): + ... + + @abc.abstractmethod + def op_builder_dir(self): + ... + + # create an instance of op builder, specified by class_name + @abc.abstractmethod + def create_op_builder(self, class_name): + ... + + # return an op builder class, specified by class_name + @abc.abstractmethod + def get_op_builder(self, class_name): + ... + + @abc.abstractmethod + def build_extension(self): + ... + + @abc.abstractmethod + def export_envs(self): + ... 
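+
+    # Illustrative sketch (an assumption about intended usage, not part of the
+    # abstract contract): concrete accelerators are expected to resolve ops
+    # through the builder hooks above, roughly along the lines of
+    #
+    #   from apex.accelerator import get_accelerator
+    #   builder = get_accelerator().create_op_builder("FusedDenseBuilder")
+    #   fused_dense_cuda = builder.load()  # pre-built extension or JIT compile
+    #
+    # which is how fused_dense gets loaded dynamically instead of being built
+    # at install time.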
+ + @abc.abstractmethod + def visible_devices_envs(self): + ... + + @abc.abstractmethod + def set_visible_devices_envs(self, current_env, local_accelerator_ids): + ... + + @abc.abstractmethod + def get_compile_backend(self): + ... + + @abc.abstractmethod + def set_compile_backend(self, backend): + ... \ No newline at end of file diff --git a/accelerator/cpu_accelerator.py b/accelerator/cpu_accelerator.py new file mode 100644 index 000000000..5bd66926d --- /dev/null +++ b/accelerator/cpu_accelerator.py @@ -0,0 +1,361 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from .abstract_accelerator import ApexAccelerator + +# During setup stage torch may not be installed, pass on no torch will +# allow op builder related API to be executed. +try: + import torch +except ImportError as e: + pass + +try: + import oneccl_bindings_for_pytorch # noqa: F401 # type: ignore + oneccl_imported_p = True +except ImportError as e: + oneccl_imported_p = False + +import os + + +# accelerator for Intel CPU +class CPU_Accelerator(ApexAccelerator): + + def __init__(self): + self._name = 'cpu' + self._compile_backend = "inductor" + if oneccl_imported_p: + self._communication_backend_name = 'ccl' + else: + # fallback to gloo if oneccl_binding_for_pytorch is not installed + self._communication_backend_name = 'gloo' + try: + import psutil + mem = psutil.Process().memory_info().rss + self.max_mem = mem + except ImportError as e: + self.max_mem = 0 + + def is_synchronized_device(self): + return True + + def use_host_timers(self): + return self.is_synchronized_device() + + def resolves_data_dependency(self): + return self.is_synchronized_device() + + def handles_memory_backpressure(self): + return self.is_synchronized_device() + + # Device APIs + def device_name(self, device_index=None): + return 'cpu' + + def device(self, device_index=None): + return None + + def set_device(self, device_index): + return + + def current_device(self): + return os.environ.get('LOCAL_RANK', 0) + + def current_device_name(self): + return 'cpu' + + def device_count(self): + device_count = int(os.environ.get('LOCAL_SIZE', 0)) + if device_count > 0: + return device_count + else: + from deepspeed.utils.numa import get_numa_cores + # Count NUMA node for number of cpu accelerators. On machine with HBM + # In flat mode, HBM is in separate NUMA node with no cores on this node. + # Ignore these NUMA nodes with no cores. 
+ numa_core_lists = get_numa_cores() + if not numa_core_lists: + return 1 + numa_count = 0 + prev_core_list = [] + for core_list in numa_core_lists: + if len(core_list) > 0 and core_list != prev_core_list: + numa_count += 1 + prev_core_list = core_list + return numa_count + + def synchronize(self, device_index=None): + return + + # RNG APIs + def random(self): + return torch.random + + def set_rng_state(self, new_state, device_index=None): + if device_index is None: + return torch.set_rng_state(new_state) + return torch.set_rng_state(new_state, device_index) + + def get_rng_state(self, device_index=None): + return torch.get_rng_state() + + def manual_seed(self, seed): + return torch.manual_seed(seed) + + def manual_seed_all(self, seed): + return torch.manual_seed(seed) + + def initial_seed(self): + return torch.initial_seed() + + def default_generator(self, device_index): + return torch.default_generator + + # Streams/Events + @property + def Stream(self): + return None + + def stream(self, stream): + from deepspeed.runtime.utils import noop_context + return noop_context() + + def current_stream(self, device_index=None): + return None + + def default_stream(self, device_index=None): + return None + + @property + def Event(self): + return None + + # Memory management + def empty_cache(self): + return + + def get_rss(self): + import psutil + mem = psutil.Process().memory_info().rss + if mem > self.max_mem: + self.max_mem = mem + return mem + + def reset_rss(self): + import psutil + mem = psutil.Process().memory_info().rss + self.max_mem = mem + return mem + + def memory_allocated(self, device_index=None): + return self.get_rss() + + def max_memory_allocated(self, device_index=None): + self.get_rss() + return self.max_mem + + def reset_max_memory_allocated(self, device_index=None): + self.reset_rss() + return + + def memory_cached(self, device_index=None): + return self.get_rss() + + def max_memory_cached(self, device_index=None): + self.get_rss() + return self.max_mem + + def reset_max_memory_cached(self, device_index=None): + self.reset_rss() + return + + def memory_stats(self, device_index=None): + mem = self.get_rss() + mem_stat = {} + mem_stat['allocated_bytes.all.current'] = mem + mem_stat['allocated_bytes.all.peak'] = self.max_mem + return mem_stat + + def reset_peak_memory_stats(self, device_index=None): + self.reset_rss() + return + + def memory_reserved(self, device_index=None): + return self.get_rss() + + def max_memory_reserved(self, device_index=None): + self.get_rss() + return self.max_mem + + def total_memory(self, device_index=None): + import psutil + return psutil.virtual_memory().total + + def available_memory(self, device_index=None): + import psutil + return psutil.virtual_memory().available + + # Misc + def amp(self): + return torch.cpu.amp + + def is_available(self): + return True + + def range_push(self, msg): + # TODO itt is currently not supported yet + # return torch.profiler.itt.range_push(msg) + return + + def range_pop(self): + # TODO itt is currently not supported yet + # return torch.profiler.itt.range_pop() + return + + def lazy_call(self, callback): + return callback() + + def communication_backend_name(self): + return self._communication_backend_name + + def is_triton_supported(self): + return False + + # Data types + def is_bf16_supported(self): + return True + + def is_fp16_supported(self): + try: + if torch.ops.mkldnn._is_mkldnn_fp16_supported(): + return True + except: + return False + + def supported_dtypes(self): + supported_dtypes = [torch.float, 
torch.bfloat16] + if self.is_fp16_supported(): + supported_dtypes.append(torch.float16) + return supported_dtypes + + # Graph operations + def create_graph(self): + return None + + def capture_to_graph(self, graph, pool=None, stream=None): + from deepspeed.runtime.utils import noop_context + return noop_context() + + def replay_graph(self, graph): + return + + # Tensor operations + @property + def BFloat16Tensor(self): + return torch.BFloat16Tensor + + @property + def ByteTensor(self): + return torch.ByteTensor + + @property + def DoubleTensor(self): + return torch.DoubleTensor + + @property + def FloatTensor(self): + return torch.FloatTensor + + @property + def HalfTensor(self): + return torch.HalfTensor + + @property + def IntTensor(self): + return torch.IntTensor + + @property + def LongTensor(self): + return torch.LongTensor + + def pin_memory(self, tensor, align_bytes=1): + return tensor + + def is_pinned(self, tensor): + return tensor.is_pinned() + + def op_builder_dir(self): + try: + # is op_builder from deepspeed or a 3p version? this should only succeed if it's deepspeed + # if successful this also means we're doing a local install and not JIT compile path + from op_builder import __apex__ # noqa: F401 # type: ignore + return "op_builder.cpu" + except ImportError: + return "apex.op_builder.cpu" + + def on_accelerator(self, tensor): + device_str = str(tensor.device) + if device_str.startswith('cpu'): + return True + else: + return False + + # create an instance of op builder and return, name specified by class_name + def create_op_builder(self, op_name): + builder_class = self.get_op_builder(op_name) + if builder_class is not None: + return builder_class() + return None + + # return an op builder class, name specified by class_name + def get_op_builder(self, class_name): + try: + # is op_builder from deepspeed or a 3p version? 
this should only succeed if it's deepspeed + # if successful this also means we're doing a local install and not JIT compile path + from op_builder import __deepspeed__ # noqa: F401 # type: ignore + from op_builder.cpu import AsyncIOBuilder, CCLCommBuilder, ShareMemCommBuilder, FusedAdamBuilder, CPUAdamBuilder, NotImplementedBuilder + except ImportError: + from deepspeed.ops.op_builder.cpu import AsyncIOBuilder, CCLCommBuilder, ShareMemCommBuilder, FusedAdamBuilder, CPUAdamBuilder, NotImplementedBuilder + + if class_name == "CCLCommBuilder": + return CCLCommBuilder + elif class_name == "ShareMemCommBuilder": + return ShareMemCommBuilder + elif class_name == "FusedAdamBuilder": + return FusedAdamBuilder + elif class_name == "CPUAdamBuilder": + return CPUAdamBuilder + elif class_name == "AsyncIOBuilder": + return AsyncIOBuilder + else: + # return a NotImplementedBuilder to avoid get NoneType[Name] in unit tests + return NotImplementedBuilder + + def build_extension(self): + from torch.utils.cpp_extension import BuildExtension + return BuildExtension + + def export_envs(self): + return [] + + # TODO: cpu's visible envs is confirmed, keep as CUDA_VISIBLE_DEVICES + def visible_devices_envs(self): + return ['CUDA_VISIBLE_DEVICES'] + + def set_visible_devices_envs(self, current_env, local_accelerator_ids): + for env in self.visible_devices_envs(): + current_env[env] = ",".join(map(str, local_accelerator_ids)) + + def get_compile_backend(self): + return self._compile_backend + + def set_compile_backend(self, backend): + supported_backends = torch._dynamo.list_backends(exclude_tags=()) + if backend in supported_backends: + self._compile_backend = backend + else: + raise ValueError( + f"{backend} not supported by {self.device_name()}. Supported Backends are {supported_backends}") \ No newline at end of file diff --git a/accelerator/cuda_accelerator.py b/accelerator/cuda_accelerator.py new file mode 100644 index 000000000..48dacb83b --- /dev/null +++ b/accelerator/cuda_accelerator.py @@ -0,0 +1,385 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import functools +import os +import pkgutil +import importlib +import sys + +from .abstract_accelerator import ApexAccelerator +# During setup stage torch may not be installed, pass on no torch will +# allow op builder related API to be executed. 
+try: + import torch.cuda +except ImportError: + pass + +# Delay import pynvml to avoid import error when CUDA is not available +pynvml = None + + +class CUDA_Accelerator(ApexAccelerator): + + def __init__(self): + self._name = 'cuda' + self._communication_backend_name = 'nccl' if sys.platform != 'win32' else 'gloo' + self._compile_backend = "inductor" + if pynvml is None: + self._init_pynvml() + + def _init_pynvml(self): + global pynvml + try: + import pynvml + except ImportError: + return + try: + pynvml.nvmlInit() + except pynvml.NVMLError: + pynvml = None + return + + def is_synchronized_device(self): + return False + + def use_host_timers(self): + return self.is_synchronized_device() + + def resolves_data_dependency(self): + return self.is_synchronized_device() + + def handles_memory_backpressure(self): + return self.is_synchronized_device() + + # Device APIs + def device_name(self, device_index=None): + if device_index is None: + return 'cuda' + return 'cuda:{}'.format(device_index) + + def communication_backend_version(self): + return torch.cuda.nccl.version() + + def device(self, device_index=None): + return torch.cuda.device(device_index) + + def set_device(self, device_index): + torch.cuda.set_device(device_index) + + def current_device(self): + return torch.cuda.current_device() + + def current_device_name(self): + return 'cuda:{}'.format(torch.cuda.current_device()) + + def device_count(self): + return torch.cuda.device_count() + + def synchronize(self, device_index=None): + return torch.cuda.synchronize(device_index) + + # RNG APIs + def random(self): + return torch.random + + def set_rng_state(self, new_state, device_index=None): + if device_index is None: + return torch.cuda.set_rng_state(new_state) + + return torch.cuda.set_rng_state(new_state, device_index) + + def get_rng_state(self, device_index=None): + if device_index is None: + return torch.cuda.get_rng_state() + + return torch.cuda.get_rng_state(device_index) + + def manual_seed(self, seed): + return torch.cuda.manual_seed(seed) + + def manual_seed_all(self, seed): + return torch.cuda.manual_seed_all(seed) + + def initial_seed(self): + return torch.cuda.initial_seed() + + def default_generator(self, device_index): + return torch.cuda.default_generators[device_index] + + # Streams/Events + @property + def Stream(self): + return torch.cuda.Stream + + def stream(self, stream): + return torch.cuda.stream(stream) + + def current_stream(self, device_index=None): + return torch.cuda.current_stream(device_index) + + def default_stream(self, device_index=None): + return torch.cuda.default_stream(device_index) + + @property + def Event(self): + return torch.cuda.Event + + # Memory management + def empty_cache(self): + return torch.cuda.empty_cache() + + def memory_allocated(self, device_index=None): + return torch.cuda.memory_allocated(device_index) + + def max_memory_allocated(self, device_index=None): + return torch.cuda.max_memory_allocated(device_index) + + def reset_max_memory_allocated(self, device_index=None): + return torch.cuda.reset_max_memory_allocated(device_index) + + def memory_cached(self, device_index=None): + return torch.cuda.memory_cached(device_index) + + def max_memory_cached(self, device_index=None): + return torch.cuda.max_memory_cached(device_index) + + def reset_max_memory_cached(self, device_index=None): + return torch.cuda.reset_max_memory_cached(device_index) + + def memory_stats(self, device_index=None): + if hasattr(torch.cuda, 'memory_stats'): + return torch.cuda.memory_stats(device_index) + + 
def reset_peak_memory_stats(self, device_index=None): + if hasattr(torch.cuda, 'reset_peak_memory_stats'): + return torch.cuda.reset_peak_memory_stats(device_index) + + def memory_reserved(self, device_index=None): + if hasattr(torch.cuda, 'memory_reserved'): + return torch.cuda.memory_reserved(device_index) + + def max_memory_reserved(self, device_index=None): + if hasattr(torch.cuda, 'max_memory_reserved'): + return torch.cuda.max_memory_reserved(device_index) + + def total_memory(self, device_index=None): + return torch.cuda.get_device_properties(device_index).total_memory + + def _get_nvml_gpu_id(self, torch_gpu_id): + """ + credit: https://discuss.pytorch.org/t/making-pynvml-match-torch-device-ids-cuda-visible-devices/103020 + + Remap torch device id to nvml device id, respecting CUDA_VISIBLE_DEVICES. + + If the latter isn't set return the same id + """ + # if CUDA_VISIBLE_DEVICES is used automagically remap the id since pynvml ignores this env var + if "CUDA_VISIBLE_DEVICES" in os.environ: + ids = list(map(int, os.environ.get("CUDA_VISIBLE_DEVICES", "").split(","))) + return ids[torch_gpu_id] # remap + else: + return torch_gpu_id + + def available_memory(self, device_index=None): + if pynvml: + if device_index is None: + device_index = self.current_device() + handle = pynvml.nvmlDeviceGetHandleByIndex(self._get_nvml_gpu_id(device_index)) + info = pynvml.nvmlDeviceGetMemoryInfo(handle) + return info.free + else: + return self.total_memory(device_index) - self.memory_allocated(device_index) + + # Data types + def is_bf16_supported(self): + if not torch.cuda.is_available(): + return True + return torch.cuda.is_bf16_supported() + + def is_fp16_supported(self): + if not torch.cuda.is_available(): + return True + # See https://docs.nvidia.com/deeplearning/tensorrt/support-matrix/index.html#hardware-precision-matrix + # FP16 on compute capability 6.x is deprecated + allow_deprecated_fp16 = os.environ.get('DS_ALLOW_DEPRECATED_FP16', '0') == '1' + major, _ = torch.cuda.get_device_capability() + if major >= 7: + return True + elif major == 6 and allow_deprecated_fp16: + return True + else: + return False + + def supported_dtypes(self): + supported_dtypes = [torch.float] + if self.is_fp16_supported(): + supported_dtypes.append(torch.half) + if self.is_bf16_supported(): + supported_dtypes.append(torch.bfloat16) + return supported_dtypes + + # Misc + def amp(self): + if hasattr(torch.cuda, 'amp'): + return torch.cuda.amp + return None + + def is_available(self): + return torch.cuda.is_available() + + def range_push(self, msg): + if hasattr(torch.cuda.nvtx, 'range_push'): + return torch.cuda.nvtx.range_push(msg) + + def range_pop(self): + if hasattr(torch.cuda.nvtx, 'range_pop'): + return torch.cuda.nvtx.range_pop() + + def lazy_call(self, callback): + return torch.cuda._lazy_call(callback) + + def communication_backend_name(self): + return self._communication_backend_name + + def is_triton_supported(self): + major, _ = torch.cuda.get_device_capability() + if major >= 8: + return True + else: + return False + + # Graph operations + def create_graph(self): + return torch.cuda.CUDAGraph() + + def capture_to_graph(self, graph, pool=None, stream=None): + return torch.cuda.graph(graph, pool, stream) + + def replay_graph(self, graph): + graph.replay() + return + + # Tensor operations + + @property + def BFloat16Tensor(self): + return functools.partial(torch.tensor, dtype=torch.bfloat16, device='cuda') + + @property + def ByteTensor(self): + return functools.partial(torch.tensor, dtype=torch.uint8, 
device='cuda') + + @property + def DoubleTensor(self): + return functools.partial(torch.tensor, dtype=torch.double, device='cuda') + + @property + def FloatTensor(self): + return functools.partial(torch.tensor, dtype=torch.float, device='cuda') + + @property + def HalfTensor(self): + return functools.partial(torch.tensor, dtype=torch.half, device='cuda') + + @property + def IntTensor(self): + return functools.partial(torch.tensor, dtype=torch.int, device='cuda') + + @property + def LongTensor(self): + return functools.partial(torch.tensor, dtype=torch.long, device='cuda') + + def pin_memory(self, tensor, align_bytes=1): + return tensor.pin_memory() + + def is_pinned(self, tensor): + return tensor.is_pinned() + + def on_accelerator(self, tensor): + device_str = str(tensor.device) + if device_str.startswith('cuda:'): + return True + else: + return False + + def op_builder_dir(self): + try: + # is op_builder from deepspeed or a 3p version? this should only succeed if it's deepspeed + # if successful this also means we're doing a local install and not JIT compile path + from op_builder import __apex__ # noqa: F401 # type: ignore + return "op_builder" + except ImportError: + return "apex.op_builder" + + # dict that holds class name <--> class type mapping i.e. + # 'AsyncIOBuilder': + # this dict will be filled at init stage + class_dict = None + + def _lazy_init_class_dict(self): + if self.class_dict is not None: + return + else: + self.class_dict = {} + # begin initialize for create_op_builder() + # put all valid class name <--> class type mapping into class_dict + op_builder_dir = self.op_builder_dir() + op_builder_module = importlib.import_module(op_builder_dir) + op_builder_absolute_path = os.path.dirname(op_builder_module.__file__) + for _, module_name, _ in pkgutil.iter_modules([op_builder_absolute_path]): + # avoid self references, + # skip sub_directories which contains ops for other backend(cpu, npu, etc.). 
+ if module_name != 'all_ops' and module_name != 'builder' and not os.path.isdir( + os.path.join(op_builder_absolute_path, module_name)): + module = importlib.import_module("{}.{}".format(op_builder_dir, module_name)) + for member_name in module.__dir__(): + if member_name.endswith( + 'Builder' + ) and member_name != "OpBuilder" and member_name != "CUDAOpBuilder" and member_name != "TorchCPUOpBuilder": # avoid abstract classes + if not member_name in self.class_dict: + self.class_dict[member_name] = getattr(module, member_name) + # end initialize for create_op_builder() + + # create an instance of op builder and return, name specified by class_name + def create_op_builder(self, class_name): + self._lazy_init_class_dict() + if class_name in self.class_dict: + return self.class_dict[class_name]() + else: + return None + + # return an op builder class, name specified by class_name + def get_op_builder(self, class_name): + self._lazy_init_class_dict() + if class_name in self.class_dict: + return self.class_dict[class_name] + else: + return None + + def build_extension(self): + from torch.utils.cpp_extension import BuildExtension + return BuildExtension + + def export_envs(self): + return ['NCCL'] + + def visible_devices_envs(self): + return ['CUDA_VISIBLE_DEVICES'] + + def set_visible_devices_envs(self, current_env, local_accelerator_ids): + for env in self.visible_devices_envs(): + current_env[env] = ",".join(map(str, local_accelerator_ids)) + + def get_compile_backend(self): + return self._compile_backend + + def set_compile_backend(self, backend): + supported_backends = torch._dynamo.list_backends(exclude_tags=()) + if backend in supported_backends: + self._compile_backend = backend + else: + raise ValueError( + f"{backend} not supported by {self.device_name()}. Supported Backends are {supported_backends}") \ No newline at end of file diff --git a/accelerator/real_accelerator.py b/accelerator/real_accelerator.py new file mode 100644 index 000000000..aaba7cf23 --- /dev/null +++ b/accelerator/real_accelerator.py @@ -0,0 +1,171 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +import os + +try: + # Importing logger currently requires that torch is installed, hence the try...except + # TODO: Remove logger dependency on torch. + from deepspeed.utils import logger as accel_logger +except ImportError as e: + accel_logger = None + +try: + from accelerator.abstract_accelerator import ApexAccelerator as dsa1 +except ImportError as e: + dsa1 = None +try: + from apex.accelerator.abstract_accelerator import ApexAccelerator as dsa2 +except ImportError as e: + dsa2 = None + +SUPPORTED_ACCELERATOR_LIST = ['cuda', 'cpu'] + +ds_accelerator = None + + +def _validate_accelerator(accel_obj): + # because abstract_accelerator has different path during + # build time (accelerator.abstract_accelerator) + # and run time (deepspeed.accelerator.abstract_accelerator) + # and extension would import the + # run time abstract_accelerator/DeepSpeedAccelerator as its base + # class, so we need to compare accel_obj with both base class. 
+ # if accel_obj is instance of DeepSpeedAccelerator in one of + # accelerator.abstractor_accelerator + # or deepspeed.accelerator.abstract_accelerator, consider accel_obj + # is a conforming object + if not ((dsa1 is not None and isinstance(accel_obj, dsa1)) or (dsa2 is not None and isinstance(accel_obj, dsa2))): + raise AssertionError(f"{accel_obj.__class__.__name__} accelerator is not subclass of ApexAccelerator") + + # TODO: turn off is_available test since this breaks tests + # assert accel_obj.is_available(), \ + # f'{accel_obj.__class__.__name__} accelerator fails is_available() test' + + +def is_current_accelerator_supported(): + return get_accelerator().device_name() in SUPPORTED_ACCELERATOR_LIST + + +def get_accelerator(): + global ds_accelerator + if ds_accelerator is not None: + return ds_accelerator + + accelerator_name = None + ds_set_method = None + # 1. Detect whether there is override of DeepSpeed accelerators from environment variable. + if "DS_ACCELERATOR" in os.environ.keys(): + accelerator_name = os.environ["DS_ACCELERATOR"] + if accelerator_name == "cpu": + pass + elif accelerator_name not in SUPPORTED_ACCELERATOR_LIST: + raise ValueError(f'DS_ACCELERATOR must be one of {SUPPORTED_ACCELERATOR_LIST}. ' + f'Value "{accelerator_name}" is not supported') + ds_set_method = "override" + + # 2. If no override, detect which accelerator to use automatically + if accelerator_name is None: + # We need a way to choose among different accelerator types. + # Currently we detect which accelerator extension is installed + # in the environment and use it if the installing answer is True. + # An alternative might be detect whether CUDA device is installed on + # the system but this comes with two pitfalls: + # 1. the system may not have torch pre-installed, so + # get_accelerator().is_available() may not work. + # 2. Some scenario like install on login node (without CUDA device) + # and run on compute node (with CUDA device) may cause mismatch + # between installation time and runtime. + + if accelerator_name is None: + try: + import torch + + # Determine if we are on a GPU or x86 CPU with torch. + # "torch.cuda.is_available()" provides a stronger guarantee, #ignore-cuda + # ensuring that we are free from CUDA initialization errors. + # While "torch.cuda.device_count() > 0" check ensures that #ignore-cuda + # we won't try to do any CUDA calls when no device is available + # For reference: https://github.com/deepspeedai/DeepSpeed/pull/6810 + if torch.cuda.device_count() > 0 and torch.cuda.is_available(): #ignore-cuda + accelerator_name = "cuda" + except (RuntimeError, ImportError) as e: + # TODO need a more decent way to detect which accelerator to use, consider using nvidia-smi command for detection + pass + if accelerator_name is None: + # borrow this log from PR#5084 + if accel_logger is not None: + accel_logger.warning( + "Setting accelerator to CPU. If you have GPU or other accelerator, we were unable to detect it.") + # cpu added as catch-all when accelerator detection fails + accelerator_name = "cpu" + + ds_set_method = "auto detect" + + # 3. 
Set ds_accelerator accordingly + if accelerator_name == "cuda": + from .cuda_accelerator import CUDA_Accelerator + + ds_accelerator = CUDA_Accelerator() + elif accelerator_name == "cpu": + from .cpu_accelerator import CPU_Accelerator + + ds_accelerator = CPU_Accelerator() + _validate_accelerator(ds_accelerator) + if accel_logger is not None: + accel_logger.info(f"Setting ds_accelerator to {ds_accelerator._name} ({ds_set_method})") + return ds_accelerator + + +def set_accelerator(accel_obj): + global ds_accelerator + _validate_accelerator(accel_obj) + if accel_logger is not None: + accel_logger.info(f"Setting ds_accelerator to {accel_obj._name} (model specified)") + ds_accelerator = accel_obj + + +""" +-----------[code] test_get.py ----------- +from deepspeed.accelerator import get_accelerator +my_accelerator = get_accelerator() +logger.info(f'{my_accelerator._name=}') +logger.info(f'{my_accelerator._communication_backend=}') +logger.info(f'{my_accelerator.HalfTensor().device=}') +logger.info(f'{my_accelerator.total_memory()=}') +-----------[code] test_get.py ----------- + +---[output] python test_get.py--------- +my_accelerator.name()='cuda' +my_accelerator.communication_backend='nccl' +my_accelerator.HalfTensor().device=device(type='cuda', index=0) +my_accelerator.total_memory()=34089730048 +---[output] python test_get.py--------- + +************************************************************************** +-----------[code] test_set.py ----------- +from deepspeed.accelerator.cuda_accelerator import CUDA_Accelerator +cu_accel = CUDA_Accelerator() +logger.info(f'{id(cu_accel)=}') +from deepspeed.accelerator import set_accelerator, get_accelerator +set_accelerator(cu_accel) + +my_accelerator = get_accelerator() +logger.info(f'{id(my_accelerator)=}') +logger.info(f'{my_accelerator._name=}') +logger.info(f'{my_accelerator._communication_backend=}') +logger.info(f'{my_accelerator.HalfTensor().device=}') +logger.info(f'{my_accelerator.total_memory()=}') +-----------[code] test_set.py ----------- + + +---[output] python test_set.py--------- +id(cu_accel)=139648165478304 +my_accelerator= +my_accelerator.name='cuda' +my_accelerator.communication_backend='nccl' +my_accelerator.HalfTensor().device=device(type='cuda', index=0) +my_accelerator.total_memory()=34089730048 +---[output] python test_set.py--------- +""" \ No newline at end of file diff --git a/apex/accelerator b/apex/accelerator new file mode 120000 index 000000000..14bf59231 --- /dev/null +++ b/apex/accelerator @@ -0,0 +1 @@ +../accelerator \ No newline at end of file diff --git a/apex/csrc b/apex/csrc new file mode 120000 index 000000000..e96d28eb5 --- /dev/null +++ b/apex/csrc @@ -0,0 +1 @@ +../csrc \ No newline at end of file diff --git a/apex/fused_dense/fused_dense.py b/apex/fused_dense/fused_dense.py index 97377a423..f19aae6da 100644 --- a/apex/fused_dense/fused_dense.py +++ b/apex/fused_dense/fused_dense.py @@ -1,9 +1,14 @@ import torch from torch import nn -import fused_dense_cuda +#import fused_dense_cuda +from apex.op_builder import FusedDenseBuilder from apex._autocast_utils import _cast_if_autocast_enabled import math + +fused_dense_cuda = FusedDenseBuilder().load() + + #implements fused GEMM+bias in forward pass using mlp_cuda from apex class FusedDenseFunc(torch.autograd.Function): @staticmethod diff --git a/apex/git_version_info.py b/apex/git_version_info.py new file mode 100644 index 000000000..1a2b76cd1 --- /dev/null +++ b/apex/git_version_info.py @@ -0,0 +1,31 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +try: + # This is populated by setup.py + from .git_version_info_installed import * # noqa: F401 # type: ignore +except ModuleNotFoundError: + import os + if os.path.isfile('version.txt'): + # Will be missing from checkouts that haven't been installed (e.g., readthedocs) + version = open('version.txt', 'r').read().strip() + else: + version = "0.0.0" + git_hash = '[none]' + git_branch = '[none]' + + from .op_builder.all_ops import ALL_OPS + installed_ops = dict.fromkeys(ALL_OPS.keys(), False) + accelerator_name = "" + torch_info = {'version': "0.0", "cuda_version": "0.0", "hip_version": "0.0"} + +# compatible_ops list is recreated for each launch +from .op_builder.all_ops import ALL_OPS + +compatible_ops = dict.fromkeys(ALL_OPS.keys(), False) +for op_name, builder in ALL_OPS.items(): + op_compatible = builder.is_compatible() + compatible_ops[op_name] = op_compatible + compatible_ops["apex_not_implemented"] = False \ No newline at end of file diff --git a/apex/op_builder b/apex/op_builder new file mode 120000 index 000000000..1e19f3e8d --- /dev/null +++ b/apex/op_builder @@ -0,0 +1 @@ +../op_builder \ No newline at end of file diff --git a/op_builder/__init__.py b/op_builder/__init__.py new file mode 100644 index 000000000..9ca584f37 --- /dev/null +++ b/op_builder/__init__.py @@ -0,0 +1,53 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import sys +import os +import pkgutil +import importlib + +from .builder import get_default_compute_capabilities, OpBuilder + +# Do not remove, required for abstract accelerator to detect if we have a deepspeed or 3p op_builder +__apex__ = True + +# List of all available op builders from deepspeed op_builder +try: + import apex.op_builder # noqa: F401 # type: ignore + op_builder_dir = "apex.op_builder" +except ImportError: + op_builder_dir = "op_builder" + +__op_builders__ = [] + +this_module = sys.modules[__name__] + + +def builder_closure(member_name): + if op_builder_dir == "op_builder": + # during installation time cannot get builder due to torch not installed, + # return closure instead + def _builder(): + from apex.accelerator import get_accelerator + builder = get_accelerator().create_op_builder(member_name) + return builder + + return _builder + else: + # during runtime, return op builder class directly + from apex.accelerator import get_accelerator + builder = get_accelerator().get_op_builder(member_name) + return builder + + +# reflect builder names and add builder closure, such as 'TransformerBuilder()' creates op builder wrt current accelerator +for _, module_name, _ in pkgutil.iter_modules([os.path.dirname(this_module.__file__)]): + if module_name != 'all_ops' and module_name != 'builder': + module = importlib.import_module(f".{module_name}", package=op_builder_dir) + for member_name in module.__dir__(): + if member_name.endswith('Builder') and member_name != "OpBuilder" and member_name != "CUDAOpBuilder": + # assign builder name to variable with same name + # the following is equivalent to i.e. 
TransformerBuilder = "TransformerBuilder" + this_module.__dict__[member_name] = builder_closure(member_name) \ No newline at end of file diff --git a/op_builder/all_ops.py b/op_builder/all_ops.py new file mode 100644 index 000000000..66b34d5bc --- /dev/null +++ b/op_builder/all_ops.py @@ -0,0 +1,28 @@ +import os +import pkgutil +import importlib +try: + # during installation time accelerator is visible, otherwise return deepspeed.accelerator + from accelerator import get_accelerator +except ImportError: + from apex.accelerator import get_accelerator + +# List of all available ops + +# reflect all builder names into __op_builders__ +op_builder_dir = get_accelerator().op_builder_dir() +op_builder_module = importlib.import_module(op_builder_dir) +__op_builders__ = [] + +for _, module_name, _ in pkgutil.iter_modules([os.path.dirname(op_builder_module.__file__)]): + # avoid self references + if module_name != 'all_ops' and module_name != 'builder': + module = importlib.import_module("{}.{}".format(op_builder_dir, module_name)) + for member_name in module.__dir__(): + if member_name.endswith('Builder'): + # append builder to __op_builders__ list + builder = get_accelerator().create_op_builder(member_name) + __op_builders__.append(builder) + +ALL_OPS = {op.name: op for op in __op_builders__ if op is not None} +accelerator_name = get_accelerator()._name \ No newline at end of file diff --git a/op_builder/builder.py b/op_builder/builder.py new file mode 100644 index 000000000..1fd7a4504 --- /dev/null +++ b/op_builder/builder.py @@ -0,0 +1,862 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import os +import re +import sys +import time +import importlib +from pathlib import Path +import subprocess +import shlex +import shutil +import tempfile +import distutils.ccompiler +import distutils.log +import distutils.sysconfig +from distutils.errors import CompileError, LinkError +from abc import ABC, abstractmethod +from typing import List + +YELLOW = '\033[93m' +END = '\033[0m' +WARNING = f"{YELLOW} [WARNING] {END}" + +DEFAULT_TORCH_EXTENSION_PATH = "/tmp/torch_extensions" +DEFAULT_COMPUTE_CAPABILITIES = "6.0;6.1;7.0" + +try: + import torch +except ImportError: + print(f"{WARNING} unable to import torch, please install it if you want to pre-compile any deepspeed ops.") +else: + TORCH_MAJOR = int(torch.__version__.split('.')[0]) + TORCH_MINOR = int(torch.__version__.split('.')[1]) + + +class MissingCUDAException(Exception): + pass + + +class CUDAMismatchException(Exception): + pass + + +def installed_cuda_version(name=""): + import torch.utils.cpp_extension + cuda_home = torch.utils.cpp_extension.CUDA_HOME + if cuda_home is None: + raise MissingCUDAException("CUDA_HOME does not exist, unable to compile CUDA op(s)") + # Ensure there is not a cuda version mismatch between torch and nvcc compiler + output = subprocess.check_output([cuda_home + "/bin/nvcc", "-V"], universal_newlines=True) + output_split = output.split() + release_idx = output_split.index("release") + release = output_split[release_idx + 1].replace(',', '').split(".") + # Ignore patch versions, only look at major + minor + cuda_major, cuda_minor = release[:2] + return int(cuda_major), int(cuda_minor) + + +def get_default_compute_capabilities(): + compute_caps = DEFAULT_COMPUTE_CAPABILITIES + # Update compute capability according to: https://en.wikipedia.org/wiki/CUDA#GPUs_supported + import torch.utils.cpp_extension + if torch.utils.cpp_extension.CUDA_HOME is not None: + if 
installed_cuda_version()[0] == 11: + if installed_cuda_version()[1] >= 0: + compute_caps += ";8.0" + if installed_cuda_version()[1] >= 1: + compute_caps += ";8.6" + if installed_cuda_version()[1] >= 8: + compute_caps += ";9.0" + elif installed_cuda_version()[0] == 12: + compute_caps += ";8.0;8.6;9.0" + if installed_cuda_version()[1] >= 8: + compute_caps += ";10.0;12.0" + return compute_caps + + +# list compatible minor CUDA versions - so that for example pytorch built with cuda-11.0 can be used +# to build deepspeed and system-wide installed cuda 11.2 +cuda_minor_mismatch_ok = { + 10: ["10.0", "10.1", "10.2"], + 11: ["11.0", "11.1", "11.2", "11.3", "11.4", "11.5", "11.6", "11.7", "11.8"], + 12: ["12.0", "12.1", "12.2", "12.3", "12.4", "12.5", "12.6", + "12.8"], # There does not appear to be a CUDA Toolkit 12.7 +} + + +def assert_no_cuda_mismatch(name=""): + cuda_major, cuda_minor = installed_cuda_version(name) + sys_cuda_version = f'{cuda_major}.{cuda_minor}' + torch_cuda_version = ".".join(torch.version.cuda.split('.')[:2]) + # This is a show-stopping error, should probably not proceed past this + if sys_cuda_version != torch_cuda_version: + if (cuda_major in cuda_minor_mismatch_ok and sys_cuda_version in cuda_minor_mismatch_ok[cuda_major] + and torch_cuda_version in cuda_minor_mismatch_ok[cuda_major]): + print(f"Installed CUDA version {sys_cuda_version} does not match the " + f"version torch was compiled with {torch.version.cuda} " + "but since the APIs are compatible, accepting this combination") + return True + elif os.getenv("DS_SKIP_CUDA_CHECK", "0") == "1": + print( + f"{WARNING} DeepSpeed Op Builder: Installed CUDA version {sys_cuda_version} does not match the " + f"version torch was compiled with {torch.version.cuda}." + "Detected `DS_SKIP_CUDA_CHECK=1`: Allowing this combination of CUDA, but it may result in unexpected behavior." + ) + return True + raise CUDAMismatchException( + f">- DeepSpeed Op Builder: Installed CUDA version {sys_cuda_version} does not match the " + f"version torch was compiled with {torch.version.cuda}, unable to compile " + "cuda/cpp extensions without a matching cuda version.") + return True + + +class OpBuilder(ABC): + _rocm_version = None + _rocm_gpu_arch = None + _rocm_wavefront_size = None + _is_rocm_pytorch = None + _is_sycl_enabled = None + _loaded_ops = {} + + def __init__(self, name): + self.name = name + self.jit_mode = False + self.build_for_cpu = False + self.enable_bf16 = False + self.error_log = None + + @abstractmethod + def absolute_name(self): + ''' + Returns absolute build path for cases where the op is pre-installed, e.g., deepspeed.ops.adam.cpu_adam + will be installed as something like: deepspeed/ops/adam/cpu_adam.so + ''' + pass + + @abstractmethod + def sources(self): + ''' + Returns list of source files for your op, relative to root of deepspeed package (i.e., DeepSpeed/deepspeed) + ''' + pass + + def hipify_extension(self): + pass + + def sycl_extension(self): + pass + + @staticmethod + def validate_torch_version(torch_info): + install_torch_version = torch_info['version'] + current_torch_version = ".".join(torch.__version__.split('.')[:2]) + if install_torch_version != current_torch_version: + raise RuntimeError("PyTorch version mismatch! DeepSpeed ops were compiled and installed " + "with a different version than what is being used at runtime. " + f"Please re-install DeepSpeed or switch torch versions. 
" + f"Install torch version={install_torch_version}, " + f"Runtime torch version={current_torch_version}") + + @staticmethod + def validate_torch_op_version(torch_info): + if not OpBuilder.is_rocm_pytorch(): + current_cuda_version = ".".join(torch.version.cuda.split('.')[:2]) + install_cuda_version = torch_info['cuda_version'] + if install_cuda_version != current_cuda_version: + raise RuntimeError("CUDA version mismatch! DeepSpeed ops were compiled and installed " + "with a different version than what is being used at runtime. " + f"Please re-install DeepSpeed or switch torch versions. " + f"Install CUDA version={install_cuda_version}, " + f"Runtime CUDA version={current_cuda_version}") + else: + current_hip_version = ".".join(torch.version.hip.split('.')[:2]) + install_hip_version = torch_info['hip_version'] + if install_hip_version != current_hip_version: + raise RuntimeError("HIP version mismatch! DeepSpeed ops were compiled and installed " + "with a different version than what is being used at runtime. " + f"Please re-install DeepSpeed or switch torch versions. " + f"Install HIP version={install_hip_version}, " + f"Runtime HIP version={current_hip_version}") + + @staticmethod + def is_rocm_pytorch(): + if OpBuilder._is_rocm_pytorch is not None: + return OpBuilder._is_rocm_pytorch + + _is_rocm_pytorch = False + try: + import torch + except ImportError: + pass + else: + if TORCH_MAJOR > 1 or (TORCH_MAJOR == 1 and TORCH_MINOR >= 5): + _is_rocm_pytorch = hasattr(torch.version, 'hip') and torch.version.hip is not None + if _is_rocm_pytorch: + from torch.utils.cpp_extension import ROCM_HOME + _is_rocm_pytorch = ROCM_HOME is not None + OpBuilder._is_rocm_pytorch = _is_rocm_pytorch + return OpBuilder._is_rocm_pytorch + + @staticmethod + def is_sycl_enabled(): + if OpBuilder._is_sycl_enabled is not None: + return OpBuilder._is_sycl_enabled + + _is_sycl_enabled = False + try: + result = subprocess.run(["c2s", "--version"], capture_output=True) + except: + pass + else: + _is_sycl_enabled = True + + OpBuilder._is_sycl_enabled = _is_sycl_enabled + return OpBuilder._is_sycl_enabled + + @staticmethod + def installed_rocm_version(): + if OpBuilder._rocm_version: + return OpBuilder._rocm_version + + ROCM_MAJOR = '0' + ROCM_MINOR = '0' + ROCM_VERSION_DEV_RAW = "" + if OpBuilder.is_rocm_pytorch(): + from torch.utils.cpp_extension import ROCM_HOME + rocm_ver_file = Path(ROCM_HOME).joinpath(".info/version") + if rocm_ver_file.is_file(): + with open(rocm_ver_file, 'r') as file: + ROCM_VERSION_DEV_RAW = file.read() + elif "rocm" in torch.__version__: + ROCM_VERSION_DEV_RAW = torch.__version__.split("rocm")[1] + if ROCM_VERSION_DEV_RAW != "": + ROCM_MAJOR = ROCM_VERSION_DEV_RAW.split('.')[0] + ROCM_MINOR = ROCM_VERSION_DEV_RAW.split('.')[1] + else: + # Look in /usr/include/rocm-version.h + rocm_ver_file = Path("/usr/include/rocm_version.h") + if rocm_ver_file.is_file(): + with open(rocm_ver_file, 'r') as file: + for ln in file.readlines(): + if "#define ROCM_VERSION_MAJOR" in ln: + ROCM_MAJOR = re.findall(r'\S+', ln)[2] + elif "#define ROCM_VERSION_MINOR" in ln: + ROCM_MINOR = re.findall(r'\S+', ln)[2] + if ROCM_MAJOR == '0': + assert False, "Could not detect ROCm version" + + OpBuilder._rocm_version = (int(ROCM_MAJOR), int(ROCM_MINOR)) + return OpBuilder._rocm_version + + @staticmethod + def get_rocm_gpu_arch(): + if OpBuilder._rocm_gpu_arch: + return OpBuilder._rocm_gpu_arch + rocm_info = Path("/opt/rocm/bin/rocminfo") + if (not rocm_info.is_file()): + rocm_info = Path("rocminfo") + rocm_gpu_arch_cmd = 
str(rocm_info) + " | grep -o -m 1 'gfx.*'" + try: + result = subprocess.check_output(rocm_gpu_arch_cmd, shell=True) + rocm_gpu_arch = result.decode('utf-8').strip() + except subprocess.CalledProcessError: + rocm_gpu_arch = "" + OpBuilder._rocm_gpu_arch = rocm_gpu_arch + return OpBuilder._rocm_gpu_arch + + @staticmethod + def get_rocm_wavefront_size(): + if OpBuilder._rocm_wavefront_size: + return OpBuilder._rocm_wavefront_size + + rocm_info = Path("/opt/rocm/bin/rocminfo") + if (not rocm_info.is_file()): + rocm_info = Path("rocminfo") + rocm_wavefront_size_cmd = str( + rocm_info) + " | grep -Eo -m1 'Wavefront Size:[[:space:]]+[0-9]+' | grep -Eo '[0-9]+'" + try: + result = subprocess.check_output(rocm_wavefront_size_cmd, shell=True) + rocm_wavefront_size = result.decode('utf-8').strip() + except subprocess.CalledProcessError: + rocm_wavefront_size = "32" + OpBuilder._rocm_wavefront_size = rocm_wavefront_size + return OpBuilder._rocm_wavefront_size + + def include_paths(self): + ''' + Returns list of include paths, relative to root of deepspeed package (i.e., DeepSpeed/deepspeed) + ''' + return [] + + def nvcc_args(self): + ''' + Returns optional list of compiler flags to forward to nvcc when building CUDA sources + ''' + return [] + + def cxx_args(self): + ''' + Returns optional list of compiler flags to forward to the build + ''' + return [] + + def is_compatible(self, verbose=False): + ''' + Check if all non-python dependencies are satisfied to build this op + ''' + return True + + def extra_ldflags(self): + return [] + + def has_function(self, funcname, libraries, library_dirs=None, verbose=False): + ''' + Test for existence of a function within a tuple of libraries. + + This is used as a smoke test to check whether a certain library is available. + As a test, this creates a simple C program that calls the specified function, + and then distutils is used to compile that program and link it with the specified libraries. + Returns True if both the compile and link are successful, False otherwise. + ''' + tempdir = None # we create a temporary directory to hold various files + filestderr = None # handle to open file to which we redirect stderr + oldstderr = None # file descriptor for stderr + try: + # Echo compile and link commands that are used. + if verbose: + distutils.log.set_verbosity(1) + + # Create a compiler object. + compiler = distutils.ccompiler.new_compiler(verbose=verbose) + + # Configure compiler and linker to build according to Python install. + distutils.sysconfig.customize_compiler(compiler) + + # Create a temporary directory to hold test files. + tempdir = tempfile.mkdtemp() + + # Define a simple C program that calls the function in question + prog = "void %s(void); int main(int argc, char** argv) { %s(); return 0; }" % (funcname, funcname) + + # Write the test program to a file. + filename = os.path.join(tempdir, 'test.c') + with open(filename, 'w') as f: + f.write(prog) + + # Redirect stderr file descriptor to a file to silence compile/link warnings. 
+ if not verbose: + filestderr = open(os.path.join(tempdir, 'stderr.txt'), 'w') + oldstderr = os.dup(sys.stderr.fileno()) + os.dup2(filestderr.fileno(), sys.stderr.fileno()) + + # Workaround for behavior in distutils.ccompiler.CCompiler.object_filenames() + # Otherwise, a local directory will be used instead of tempdir + drive, driveless_filename = os.path.splitdrive(filename) + root_dir = driveless_filename[0] if os.path.isabs(driveless_filename) else '' + output_dir = os.path.join(drive, root_dir) + + # Attempt to compile the C program into an object file. + cflags = shlex.split(os.environ.get('CFLAGS', "")) + objs = compiler.compile([filename], output_dir=output_dir, extra_preargs=self.strip_empty_entries(cflags)) + + # Attempt to link the object file into an executable. + # Be sure to tack on any libraries that have been specified. + ldflags = shlex.split(os.environ.get('LDFLAGS', "")) + compiler.link_executable(objs, + os.path.join(tempdir, 'a.out'), + extra_preargs=self.strip_empty_entries(ldflags), + libraries=libraries, + library_dirs=library_dirs) + + # Compile and link succeeded + return True + + except CompileError: + return False + + except LinkError: + return False + + except: + return False + + finally: + # Restore stderr file descriptor and close the stderr redirect file. + if oldstderr is not None: + os.dup2(oldstderr, sys.stderr.fileno()) + if filestderr is not None: + filestderr.close() + + # Delete the temporary directory holding the test program and stderr files. + if tempdir is not None: + shutil.rmtree(tempdir) + + def strip_empty_entries(self, args): + ''' + Drop any empty strings from the list of compile and link flags + ''' + return [x for x in args if len(x) > 0] + + def cpu_arch(self): + try: + from cpuinfo import get_cpu_info + except ImportError as e: + cpu_info = self._backup_cpuinfo() + if cpu_info is None: + return "-march=native" + + try: + cpu_info = get_cpu_info() + except Exception as e: + self.warning(f"{self.name} attempted to use py-cpuinfo but failed (exception type: {type(e)}, {e}), " + "falling back to lscpu to get this information.") + cpu_info = self._backup_cpuinfo() + if cpu_info is None: + return "-march=native" + + if cpu_info['arch'].startswith('PPC_'): + # gcc does not provide -march on PowerPC, use -mcpu instead + return '-mcpu=native' + return '-march=native' + + def get_cuda_compile_flag(self): + try: + if not self.is_rocm_pytorch(): + assert_no_cuda_mismatch(self.name) + return "-D__ENABLE_CUDA__" + except MissingCUDAException: + print(f"{WARNING} {self.name} cuda is missing or is incompatible with installed torch, " + "only cpu ops can be compiled!") + return '-D__DISABLE_CUDA__' + return '-D__DISABLE_CUDA__' + + def _backup_cpuinfo(self): + # Construct cpu_info dict from lscpu that is similar to what py-cpuinfo provides + if not self.command_exists('lscpu'): + self.warning(f"{self.name} attempted to query 'lscpu' after failing to use py-cpuinfo " + "to detect the CPU architecture. 
'lscpu' does not appear to exist on " + "your system, will fall back to use -march=native and non-vectorized execution.") + return None + result = subprocess.check_output(['lscpu']) + result = result.decode('utf-8').strip().lower() + + cpu_info = {} + cpu_info['arch'] = None + cpu_info['flags'] = "" + if 'genuineintel' in result or 'authenticamd' in result: + cpu_info['arch'] = 'X86_64' + if 'avx512' in result: + cpu_info['flags'] += 'avx512,' + elif 'avx512f' in result: + cpu_info['flags'] += 'avx512f,' + if 'avx2' in result: + cpu_info['flags'] += 'avx2' + elif 'ppc64le' in result: + cpu_info['arch'] = "PPC_" + + return cpu_info + + def simd_width(self): + try: + from cpuinfo import get_cpu_info + except ImportError as e: + cpu_info = self._backup_cpuinfo() + if cpu_info is None: + return '-D__SCALAR__' + + try: + cpu_info = get_cpu_info() + except Exception as e: + self.warning(f"{self.name} attempted to use py-cpuinfo but failed (exception type: {type(e)}, {e}), " + "falling back to lscpu to get this information.") + cpu_info = self._backup_cpuinfo() + if cpu_info is None: + return '-D__SCALAR__' + + if cpu_info['arch'] == 'X86_64': + if 'avx512' in cpu_info['flags'] or 'avx512f' in cpu_info['flags']: + return '-D__AVX512__' + elif 'avx2' in cpu_info['flags']: + return '-D__AVX256__' + return '-D__SCALAR__' + + def command_exists(self, cmd): + if '|' in cmd: + cmds = cmd.split("|") + else: + cmds = [cmd] + valid = False + for cmd in cmds: + safe_cmd = ["bash", "-c", f"type {cmd}"] + result = subprocess.Popen(safe_cmd, stdout=subprocess.PIPE) + valid = valid or result.wait() == 0 + + if not valid and len(cmds) > 1: + print(f"{WARNING} {self.name} requires one of the following commands '{cmds}', but it does not exist!") + elif not valid and len(cmds) == 1: + print(f"{WARNING} {self.name} requires the '{cmd}' command, but it does not exist!") + return valid + + def warning(self, msg): + self.error_log = f"{msg}" + print(f"{WARNING} {msg}") + + def deepspeed_src_path(self, code_path): + if os.path.isabs(code_path): + return code_path + else: + return os.path.join(Path(__file__).parent.parent.absolute(), code_path) + + def builder(self): + from torch.utils.cpp_extension import CppExtension + include_dirs = [os.path.abspath(x) for x in self.strip_empty_entries(self.include_paths())] + return CppExtension(name=self.absolute_name(), + sources=self.strip_empty_entries(self.sources()), + include_dirs=include_dirs, + extra_compile_args={'cxx': self.strip_empty_entries(self.cxx_args())}, + extra_link_args=self.strip_empty_entries(self.extra_ldflags())) + + def load(self, verbose=True): + if self.name in __class__._loaded_ops: + return __class__._loaded_ops[self.name] + + from apex.git_version_info import installed_ops, torch_info, accelerator_name + from apex.accelerator import get_accelerator + if installed_ops.get(self.name, False) and accelerator_name == get_accelerator()._name: + # Ensure the op we're about to load was compiled with the same + # torch/cuda versions we are currently using at runtime. 
+ self.validate_torch_version(torch_info) + if torch.cuda.is_available() and isinstance(self, CUDAOpBuilder): + self.validate_torch_op_version(torch_info) + + op_module = importlib.import_module(self.absolute_name()) + __class__._loaded_ops[self.name] = op_module + return op_module + else: + return self.jit_load(verbose) + + def jit_load(self, verbose=True): + if not self.is_compatible(verbose): + raise RuntimeError( + f"Unable to JIT load the {self.name} op due to it not being compatible due to hardware/software issue. {self.error_log}" + ) + try: + import ninja # noqa: F401 # type: ignore + except ImportError: + raise RuntimeError(f"Unable to JIT load the {self.name} op due to ninja not being installed.") + + if isinstance(self, CUDAOpBuilder) and not self.is_rocm_pytorch(): + self.build_for_cpu = not torch.cuda.is_available() + + self.jit_mode = True + from torch.utils.cpp_extension import load + + start_build = time.time() + sources = [os.path.abspath(self.deepspeed_src_path(path)) for path in self.sources()] + extra_include_paths = [os.path.abspath(self.deepspeed_src_path(path)) for path in self.include_paths()] + + # Torch will try and apply whatever CCs are in the arch list at compile time, + # we have already set the intended targets ourselves we know that will be + # needed at runtime. This prevents CC collisions such as multiple __half + # implementations. Stash arch list to reset after build. + torch_arch_list = None + if "TORCH_CUDA_ARCH_LIST" in os.environ: + torch_arch_list = os.environ.get("TORCH_CUDA_ARCH_LIST") + os.environ["TORCH_CUDA_ARCH_LIST"] = "" + + nvcc_args = self.strip_empty_entries(self.nvcc_args()) + cxx_args = self.strip_empty_entries(self.cxx_args()) + + cxx_args.append("-UC10_USE_GLOG") + nvcc_args.append("-UC10_USE_GLOG") + if isinstance(self, CUDAOpBuilder): + if not self.build_for_cpu and self.enable_bf16: + cxx_args.append("-DBF16_AVAILABLE") + nvcc_args.append("-DBF16_AVAILABLE") + nvcc_args.append("-U__CUDA_NO_BFLOAT16_OPERATORS__") + nvcc_args.append("-U__CUDA_NO_BFLOAT162_OPERATORS__") + nvcc_args.append("-U__CUDA_NO_BFLOAT16_CONVERSIONS__") + + if self.is_rocm_pytorch(): + cxx_args.append("-D__HIP_PLATFORM_AMD__=1") + os.environ["PYTORCH_ROCM_ARCH"] = self.get_rocm_gpu_arch() + cxx_args.append('-DROCM_WAVEFRONT_SIZE=%s' % self.get_rocm_wavefront_size()) + + op_module = load(name=self.name, + sources=self.strip_empty_entries(sources), + extra_include_paths=self.strip_empty_entries(extra_include_paths), + extra_cflags=cxx_args, + extra_cuda_cflags=nvcc_args, + extra_ldflags=self.strip_empty_entries(self.extra_ldflags()), + with_cuda=True if (isinstance(self, CUDAOpBuilder) and not self.build_for_cpu) else None, + verbose=verbose) + + build_duration = time.time() - start_build + if verbose: + print(f"Time to load {self.name} op: {build_duration} seconds") + + # Reset arch list so we are not silently removing it for other possible use cases + if torch_arch_list: + os.environ["TORCH_CUDA_ARCH_LIST"] = torch_arch_list + + __class__._loaded_ops[self.name] = op_module + + return op_module + + +class CUDAOpBuilder(OpBuilder): + + def compute_capability_args(self, cross_compile_archs=None): + """ + Returns nvcc compute capability compile flags. + + 1. `TORCH_CUDA_ARCH_LIST` takes priority over `cross_compile_archs`. + 2. If neither is set default compute capabilities will be used + 3. Under `jit_mode` compute capabilities of all visible cards will be used plus PTX + + Format: + + - `TORCH_CUDA_ARCH_LIST` may use ; or whitespace separators. 
Examples: + + TORCH_CUDA_ARCH_LIST="6.1;7.5;8.6;9.0;10.0" pip install ... + TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0 7.5 8.0 8.6 9.0 10.0+PTX" pip install ... + + - `cross_compile_archs` uses ; separator. + + """ + ccs = [] + if self.jit_mode: + # Compile for underlying architectures since we know those at runtime + for i in range(torch.cuda.device_count()): + CC_MAJOR, CC_MINOR = torch.cuda.get_device_capability(i) + cc = f"{CC_MAJOR}.{CC_MINOR}" + if cc not in ccs: + ccs.append(cc) + ccs = sorted(ccs) + ccs[-1] += '+PTX' + else: + # Cross-compile mode, compile for various architectures + # env override takes priority + cross_compile_archs_env = os.environ.get('TORCH_CUDA_ARCH_LIST', None) + if cross_compile_archs_env is not None: + if cross_compile_archs is not None: + print( + f"{WARNING} env var TORCH_CUDA_ARCH_LIST={cross_compile_archs_env} overrides cross_compile_archs={cross_compile_archs}" + ) + cross_compile_archs = cross_compile_archs_env.replace(' ', ';') + else: + if cross_compile_archs is None: + cross_compile_archs = get_default_compute_capabilities() + ccs = cross_compile_archs.split(';') + + ccs = self.filter_ccs(ccs) + if len(ccs) == 0: + raise RuntimeError( + f"Unable to load {self.name} op due to no compute capabilities remaining after filtering") + + args = [] + self.enable_bf16 = True + for cc in ccs: + num = cc[0] + cc[1].split('+')[0] + args.append(f'-gencode=arch=compute_{num},code=sm_{num}') + if cc[1].endswith('+PTX'): + args.append(f'-gencode=arch=compute_{num},code=compute_{num}') + + if int(cc[0]) <= 7: + self.enable_bf16 = False + + return args + + def filter_ccs(self, ccs: List[str]): + """ + Prune any compute capabilities that are not compatible with the builder. Should log + which CCs have been pruned. + """ + return [cc.split('.') for cc in ccs] + + def version_dependent_macros(self): + # Fix from apex that might be relevant for us as well, related to https://github.com/NVIDIA/apex/issues/456 + version_ge_1_1 = [] + if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 0): + version_ge_1_1 = ['-DVERSION_GE_1_1'] + version_ge_1_3 = [] + if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 2): + version_ge_1_3 = ['-DVERSION_GE_1_3'] + version_ge_1_5 = [] + if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 4): + version_ge_1_5 = ['-DVERSION_GE_1_5'] + return version_ge_1_1 + version_ge_1_3 + version_ge_1_5 + + def is_compatible(self, verbose=False): + return super().is_compatible(verbose) + + def builder(self): + try: + if not self.is_rocm_pytorch(): + assert_no_cuda_mismatch(self.name) + self.build_for_cpu = False + except MissingCUDAException: + self.build_for_cpu = True + + if self.build_for_cpu: + from torch.utils.cpp_extension import CppExtension as ExtensionBuilder + else: + from torch.utils.cpp_extension import CUDAExtension as ExtensionBuilder + include_dirs = [os.path.abspath(x) for x in self.strip_empty_entries(self.include_paths())] + compile_args = {'cxx': self.strip_empty_entries(self.cxx_args())} if self.build_for_cpu else \ + {'cxx': self.strip_empty_entries(self.cxx_args()), \ + 'nvcc': self.strip_empty_entries(self.nvcc_args())} + + if not self.build_for_cpu and self.enable_bf16: + compile_args['cxx'].append("-DBF16_AVAILABLE") + compile_args['nvcc'].append("-DBF16_AVAILABLE") + + if self.is_rocm_pytorch(): + compile_args['cxx'].append("-D__HIP_PLATFORM_AMD__=1") + #cxx compiler args are required to compile cpp files + compile_args['cxx'].append('-DROCM_WAVEFRONT_SIZE=%s' % self.get_rocm_wavefront_size()) + #nvcc compiler args 
are required to compile hip files + compile_args['nvcc'].append('-DROCM_WAVEFRONT_SIZE=%s' % self.get_rocm_wavefront_size()) + if self.get_rocm_gpu_arch(): + os.environ["PYTORCH_ROCM_ARCH"] = self.get_rocm_gpu_arch() + + cuda_ext = ExtensionBuilder(name=self.absolute_name(), + sources=self.strip_empty_entries(self.sources()), + include_dirs=include_dirs, + libraries=self.strip_empty_entries(self.libraries_args()), + extra_compile_args=compile_args, + extra_link_args=self.strip_empty_entries(self.extra_ldflags())) + + if self.is_rocm_pytorch(): + # hip converts paths to absolute, this converts back to relative + sources = cuda_ext.sources + curr_file = Path(__file__).parent.parent # ds root + for i in range(len(sources)): + src = Path(sources[i]) + if src.is_absolute(): + sources[i] = str(src.relative_to(curr_file)) + else: + sources[i] = str(src) + cuda_ext.sources = sources + return cuda_ext + + def hipify_extension(self): + if self.is_rocm_pytorch(): + from torch.utils.hipify import hipify_python + hipify_python.hipify( + project_directory=os.getcwd(), + output_directory=os.getcwd(), + header_include_dirs=self.include_paths(), + includes=[os.path.join(os.getcwd(), '*')], + extra_files=[os.path.abspath(s) for s in self.sources()], + show_detailed=True, + is_pytorch_extension=True, + hipify_extra_files_only=True, + ) + + def cxx_args(self): + if sys.platform == "win32": + return ['-O2'] + else: + return ['-O3', '-std=c++17', '-g', '-Wno-reorder'] + + def nvcc_args(self): + if self.build_for_cpu: + return [] + args = ['-O3'] + if self.is_rocm_pytorch(): + ROCM_MAJOR, ROCM_MINOR = self.installed_rocm_version() + args += [ + '-std=c++17', '-U__HIP_NO_HALF_OPERATORS__', '-U__HIP_NO_HALF_CONVERSIONS__', + '-U__HIP_NO_HALF2_OPERATORS__', + '-DROCM_VERSION_MAJOR=%s' % ROCM_MAJOR, + '-DROCM_VERSION_MINOR=%s' % ROCM_MINOR + ] + else: + try: + nvcc_threads = int(os.getenv("DS_NVCC_THREADS", "")) + if nvcc_threads <= 0: + raise ValueError("") + except ValueError: + nvcc_threads = min(os.cpu_count(), 8) + + cuda_major, cuda_minor = installed_cuda_version() + if cuda_major > 10: + if cuda_major == 12 and cuda_minor >= 5: + std_lib = '-std=c++20' + else: + std_lib = '-std=c++17' + else: + std_lib = '-std=c++14' + args += [ + '-allow-unsupported-compiler' if sys.platform == "win32" else '', '--use_fast_math', std_lib, + '-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', '-U__CUDA_NO_HALF2_OPERATORS__', + f'--threads={nvcc_threads}' + ] + if os.environ.get('DS_DEBUG_CUDA_BUILD', '0') == '1': + args.append('--ptxas-options=-v') + args += self.compute_capability_args() + return args + + def libraries_args(self): + if self.build_for_cpu: + return [] + + if sys.platform == "win32": + return ['cublas', 'curand'] + else: + return [] + + +class TorchCPUOpBuilder(CUDAOpBuilder): + + def get_cuda_lib64_path(self): + import torch + if not self.is_rocm_pytorch(): + CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "lib64") + if not os.path.exists(CUDA_LIB64): + CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "lib") + else: + CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.ROCM_HOME, "lib") + return CUDA_LIB64 + + def extra_ldflags(self): + if self.build_for_cpu: + return ['-fopenmp'] + + if not self.is_rocm_pytorch(): + ld_flags = ['-lcurand'] + if not self.build_for_cpu: + ld_flags.append(f'-L{self.get_cuda_lib64_path()}') + return ld_flags + + return [] + + def cxx_args(self): + args = [] + if not self.build_for_cpu: + CUDA_LIB64 = self.get_cuda_lib64_path() + + args 
+= super().cxx_args() + args += [ + f'-L{CUDA_LIB64}', + '-lcudart', + '-lcublas', + '-g', + ] + + CPU_ARCH = self.cpu_arch() + SIMD_WIDTH = self.simd_width() + CUDA_ENABLE = self.get_cuda_compile_flag() + args += [ + CPU_ARCH, + '-fopenmp', + SIMD_WIDTH, + CUDA_ENABLE, + ] + + return args \ No newline at end of file diff --git a/op_builder/fused_dense.py b/op_builder/fused_dense.py new file mode 100644 index 000000000..2b8ef178f --- /dev/null +++ b/op_builder/fused_dense.py @@ -0,0 +1,40 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from .builder import CUDAOpBuilder + +import sys + + +class FusedDenseBuilder(CUDAOpBuilder): + BUILD_VAR = 'DS_BUILD_FUSED_DENSE' + NAME = "fused_dense_cuda" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ['csrc/fused_dense_base.cpp', 'csrc/fused_dense_cuda.cu'] + + def include_paths(self): + return [] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + nvcc_flags = ['-O3'] + self.version_dependent_macros() + if self.is_rocm_pytorch(): + ROCM_MAJOR, ROCM_MINOR = self.installed_rocm_version() + nvcc_flags += ['-DROCM_VERSION_MAJOR=%s' % ROCM_MAJOR, '-DROCM_VERSION_MINOR=%s' % ROCM_MINOR] + else: + nvcc_flags.extend( + ['-allow-unsupported-compiler' if sys.platform == "win32" else '', '-lineinfo', '--use_fast_math'] + + self.compute_capability_args()) + return nvcc_flags \ No newline at end of file diff --git a/setup.py b/setup.py index c4044a0a3..96b2d8a1d 100644 --- a/setup.py +++ b/setup.py @@ -17,12 +17,23 @@ load, ) +import typing +import shlex + +sys.path.insert(0, os.path.abspath(os.path.dirname(__file__))) + +from op_builder import get_default_compute_capabilities, OpBuilder +from op_builder.all_ops import ALL_OPS, accelerator_name +from op_builder.builder import installed_cuda_version + +from accelerator import get_accelerator # ninja build does not work unless include_dirs are abs path this_dir = os.path.dirname(os.path.abspath(__file__)) torch_dir = torch.__path__[0] + # https://github.com/pytorch/pytorch/pull/71881 # For the extensions which have rocblas_gemm_flags_fp16_alt_impl we need to make sure if at::BackwardPassGuard exists. # It helps the extensions be backward compatible with old PyTorch versions. 
@@ -237,748 +248,154 @@ def check_if_rocm_pytorch(): "found torch.__version__ = {}".format(torch.__version__) ) -if "--cpp_ext" in sys.argv: - sys.argv.remove("--cpp_ext") - ext_modules.append(CppExtension("apex_C", ["csrc/flatten_unflatten.cpp"])) - -if "--distributed_adam" in sys.argv or "--cuda_ext" in sys.argv: - if "--distributed_adam" in sys.argv: - sys.argv.remove("--distributed_adam") - - raise_if_home_none("--distributed_adam") - nvcc_args_adam = ['-O3', '--use_fast_math'] + version_dependent_macros - hipcc_args_adam = ['-O3'] + version_dependent_macros - ext_modules.append( - CUDAExtension( - name='distributed_adam_cuda', - sources=[ - 'apex/contrib/csrc/optimizers/multi_tensor_distopt_adam.cpp', - 'apex/contrib/csrc/optimizers/multi_tensor_distopt_adam_kernel.cu', - ], - include_dirs=[ - os.path.join(this_dir, 'csrc'), - os.path.join(this_dir, 'apex/contrib/csrc/optimizers'), - ], - extra_compile_args={ - 'cxx': ['-O3',] + version_dependent_macros, - 'nvcc':nvcc_args_adam if not IS_ROCM_PYTORCH else hipcc_args_adam, - } - ) - ) - -if "--distributed_lamb" in sys.argv or "--cuda_ext" in sys.argv: - if "--distributed_lamb" in sys.argv: - sys.argv.remove("--distributed_lamb") - - raise_if_home_none("--distributed_lamb") - - print ("INFO: Building the distributed_lamb extension.") - nvcc_args_distributed_lamb = ['-O3', '--use_fast_math'] + version_dependent_macros - hipcc_args_distributed_lamb = ['-O3'] + version_dependent_macros - ext_modules.append( - CUDAExtension( - name='distributed_lamb_cuda', - sources=[ - 'apex/contrib/csrc/optimizers/multi_tensor_distopt_lamb.cpp', - 'apex/contrib/csrc/optimizers/multi_tensor_distopt_lamb_kernel.cu', - ], - include_dirs=[os.path.join(this_dir, 'csrc')], - extra_compile_args={ - 'cxx': ['-O3',] + version_dependent_macros, - 'nvcc': nvcc_args_distributed_lamb if not IS_ROCM_PYTORCH else hipcc_args_distributed_lamb, - } - ) - ) - - -if "--cuda_ext" in sys.argv: - raise_if_home_none("--cuda_ext") - - if not IS_ROCM_PYTORCH: - check_cuda_torch_binary_vs_bare_metal(CUDA_HOME) - else: - check_rocm_torch_binary_vs_bare_metal(ROCM_HOME) - -#********** multi-tensor apply **************** - print ("INFO: Building the multi-tensor apply extension.") - nvcc_args_multi_tensor = ['-lineinfo', '-O3', '--use_fast_math'] + version_dependent_macros - hipcc_args_multi_tensor = ['-O3'] + version_dependent_macros - ext_modules.append( - CUDAExtension( - name='amp_C', - sources=[ - 'csrc/amp_C_frontend.cpp', - 'csrc/multi_tensor_sgd_kernel.cu', - 'csrc/multi_tensor_scale_kernel.cu', - 'csrc/multi_tensor_axpby_kernel.cu', - 'csrc/multi_tensor_l2norm_kernel.cu', - 'csrc/multi_tensor_l2norm_kernel_mp.cu', - 'csrc/multi_tensor_l2norm_scale_kernel.cu', - 'csrc/multi_tensor_lamb_stage_1.cu', - 'csrc/multi_tensor_lamb_stage_2.cu', - 'csrc/multi_tensor_adam.cu', - 'csrc/multi_tensor_adagrad.cu', - 'csrc/multi_tensor_novograd.cu', - 'csrc/multi_tensor_lars.cu', - 'csrc/multi_tensor_lamb.cu', - 'csrc/multi_tensor_lamb_mp.cu'], - include_dirs=[os.path.join(this_dir, 'csrc')], - extra_compile_args={'cxx': ['-O3'] + version_dependent_macros, - 'nvcc': nvcc_args_multi_tensor if not IS_ROCM_PYTORCH else hipcc_args_multi_tensor, - } - ) - ) - -#********** syncbn **************** - print("INFO: Building syncbn extension.") - ext_modules.append( - CUDAExtension( - name='syncbn', - sources=[ - 'csrc/syncbn.cpp', - 'csrc/welford.cu', - ], - include_dirs=[os.path.join(this_dir, 'csrc')], - extra_compile_args={ - 'cxx': ['-O3'] + version_dependent_macros, - 'nvcc':['-O3'] + 
version_dependent_macros, - } - ) - ) - -#********** fused layernorm **************** - nvcc_args_layer_norm = ['-maxrregcount=50', '-O3', '--use_fast_math'] + version_dependent_macros - hipcc_args_layer_norm = ['-O3'] + version_dependent_macros - - print ("INFO: Building fused layernorm extension.") - ext_modules.append( - CUDAExtension( - name='fused_layer_norm_cuda', - sources=[ - 'csrc/layer_norm_cuda.cpp', - 'csrc/layer_norm_cuda_kernel.cu', - ], - include_dirs=[os.path.join(this_dir, 'csrc')], - extra_compile_args={ - 'cxx': ['-O3'] + version_dependent_macros, - 'nvcc': nvcc_args_layer_norm if not IS_ROCM_PYTORCH else hipcc_args_layer_norm, - } - ) - ) -#********** fused dense **************** - ext_modules.append( - CUDAExtension( - name='fused_dense_cuda', - sources=[ - 'csrc/fused_dense_base.cpp', - 'csrc/fused_dense_cuda.cu', - ], - extra_compile_args={ - 'cxx': ['-O3'] + version_dependent_macros, - 'nvcc':['-O3'] + version_dependent_macros - } - ) - ) - - bare_metal_version = Version(bare_metal_version) - print("Bare Metal Version : ", bare_metal_version) - if True: - - cc_flag = [] - cc_flag.append("-gencode") - cc_flag.append("arch=compute_70,code=sm_70") - cc_flag.append("-gencode") - cc_flag.append("arch=compute_80,code=sm_80") - if bare_metal_version >= Version("11.1"): - cc_flag.append("-gencode") - cc_flag.append("arch=compute_86,code=sm_86") - if bare_metal_version >= Version("11.8"): - cc_flag.append("-gencode") - cc_flag.append("arch=compute_90,code=sm_90") - - nvcc_args_fused_weight_gradient = [ - "-O3", - "-U__CUDA_NO_HALF_OPERATORS__", - "-U__CUDA_NO_HALF_CONVERSIONS__", - "--expt-relaxed-constexpr", - "--expt-extended-lambda", - "--use_fast_math", - ] + version_dependent_macros + cc_flag - - hipcc_args_fused_weight_gradient = [ - "-O3", - "-U__CUDA_NO_HALF_OPERATORS__", - "-U__CUDA_NO_HALF_CONVERSIONS__" - ] + version_dependent_macros - - ext_modules.append( - CUDAExtension( - name="fused_weight_gradient_mlp_cuda", - include_dirs=[os.path.join(this_dir, "csrc")], - sources=[ - "csrc/megatron/fused_weight_gradient_dense.cpp", - "csrc/megatron/fused_weight_gradient_dense_cuda.cu", - "csrc/megatron/fused_weight_gradient_dense_16bit_prec_cuda.cu", - ], - extra_compile_args={ - "cxx": ["-O3"] + version_dependent_macros, - "nvcc": nvcc_args_fused_weight_gradient if not IS_ROCM_PYTORCH else hipcc_args_fused_weight_gradient, - }, - ) - ) -#********** mlp_cuda **************** - hipcc_args_mlp = ['-O3'] + version_dependent_macros - if found_Backward_Pass_Guard: - hipcc_args_mlp = hipcc_args_mlp + ['-DBACKWARD_PASS_GUARD'] + ['-DBACKWARD_PASS_GUARD_CLASS=BackwardPassGuard'] - if found_ROCmBackward_Pass_Guard: - hipcc_args_mlp = hipcc_args_mlp + ['-DBACKWARD_PASS_GUARD'] + ['-DBACKWARD_PASS_GUARD_CLASS=ROCmBackwardPassGuard'] - - print ("INFO: Building the MLP Extension.") - ext_modules.append( - CUDAExtension( - name='mlp_cuda', - sources=[ - 'csrc/mlp.cpp', - 'csrc/mlp_cuda.cu', - ], - include_dirs=[os.path.join(this_dir, 'csrc')], - extra_compile_args={ - 'cxx': ['-O3'] + version_dependent_macros, - 'nvcc':['-O3'] + version_dependent_macros if not IS_ROCM_PYTORCH else hipcc_args_mlp, - } - ) - ) +# ***************************** Op builder ********************** -#********** scaled_upper_triang_masked_softmax_cuda **************** - nvcc_args_transformer = ['-O3', - '-U__CUDA_NO_HALF_OPERATORS__', - '-U__CUDA_NO_HALF_CONVERSIONS__', - '--expt-relaxed-constexpr', - '--expt-extended-lambda'] + version_dependent_macros - hipcc_args_transformer = ['-O3', - 
'-U__CUDA_NO_HALF_OPERATORS__', - '-U__CUDA_NO_HALF_CONVERSIONS__'] + version_dependent_macros - - ext_modules.append( - CUDAExtension( - name='scaled_upper_triang_masked_softmax_cuda', - sources=[ - 'csrc/megatron/scaled_upper_triang_masked_softmax_cpu.cpp', - 'csrc/megatron/scaled_upper_triang_masked_softmax_cuda.cu', - ], - include_dirs=[os.path.join(this_dir, 'csrc')], - extra_compile_args={ - 'cxx': ['-O3'] + version_dependent_macros, - 'nvcc':nvcc_args_transformer if not IS_ROCM_PYTORCH else hipcc_args_transformer, - } - ) - ) -#*********** generic_scaled_masked_softmax_cuda **************** - ext_modules.append( - CUDAExtension( - name="generic_scaled_masked_softmax_cuda", - sources=[ - "csrc/megatron/generic_scaled_masked_softmax_cpu.cpp", - "csrc/megatron/generic_scaled_masked_softmax_cuda.cu", - ], - include_dirs=[os.path.join(this_dir, "csrc")], - extra_compile_args={ - "cxx": ["-O3"] + version_dependent_macros, - "nvcc": nvcc_args_transformer if not IS_ROCM_PYTORCH else hipcc_args_transformer, - }, - ) - ) +def get_env_if_set(key, default: typing.Any = ""): + """ + Returns an environment variable if it is set and not "", + otherwise returns a default value. In contrast, the fallback + parameter of os.environ.get() is skipped if the variable is set to "". + """ + return os.environ.get(key, None) or default +def command_exists(cmd): + if sys.platform == "win32": + safe_cmd = shlex.split(f'{cmd}') + result = subprocess.Popen(safe_cmd, stdout=subprocess.PIPE) + return result.wait() == 1 + else: + safe_cmd = shlex.split(f"bash -c type {cmd}") + result = subprocess.Popen(safe_cmd, stdout=subprocess.PIPE) + return result.wait() == 0 -#*********** scaled_masked_softmax_cuda **************** - ext_modules.append( - CUDAExtension( - name='scaled_masked_softmax_cuda', - sources=[ - 'csrc/megatron/scaled_masked_softmax_cpu.cpp', - 'csrc/megatron/scaled_masked_softmax_cuda.cu', - ], - include_dirs=[os.path.join(this_dir, 'csrc'), - os.path.join(this_dir, 'csrc/megatron')], - extra_compile_args={ - 'cxx': ['-O3'] + version_dependent_macros, - 'nvcc':nvcc_args_transformer if not IS_ROCM_PYTORCH else hipcc_args_transformer, - } - ) - ) -#*********** scaled_softmax_cuda **************** - ext_modules.append( - CUDAExtension( - name="scaled_softmax_cuda", - sources=[ - "csrc/megatron/scaled_softmax_cpu.cpp", - "csrc/megatron/scaled_softmax_cuda.cu", - ], - include_dirs=[os.path.join(this_dir, "csrc")], - extra_compile_args={ - "cxx": ["-O3"] + version_dependent_macros, - "nvcc":nvcc_args_transformer if not IS_ROCM_PYTORCH else hipcc_args_transformer, - } - ) - ) +BUILD_OP_PLATFORM = 1 if sys.platform == "win32" else 0 +BUILD_OP_DEFAULT = int(get_env_if_set('DS_BUILD_OPS', BUILD_OP_PLATFORM)) +print(f"DS_BUILD_OPS={BUILD_OP_DEFAULT}") -#*********** fused_rotary_positional_embedding **************** - if IS_ROCM_PYTORCH and "--aiter" in sys.argv: - sys.argv.remove("--aiter") - subprocess.run(["pip", "install", "."], cwd = "third_party/aiter") - - ext_modules.append( - CUDAExtension( - name="fused_rotary_positional_embedding", - sources=[ - "csrc/megatron/fused_rotary_positional_embedding.cpp", - "csrc/megatron/fused_rotary_positional_embedding_cuda.cu", - ], - include_dirs=[os.path.join(this_dir, "csrc")], - extra_compile_args={ - "cxx": ["-O3"] + version_dependent_macros, - "nvcc":nvcc_args_transformer if not IS_ROCM_PYTORCH else hipcc_args_transformer, - } - ) - ) +ext_modules2 = [] -#*********** fused_bias_swiglu **************** - nvcc_args_swiglu = ['-O3', - '-U__CUDA_NO_HALF_OPERATORS__', - 
'-U__CUDA_NO_HALF_CONVERSIONS__', - '--expt-relaxed-constexpr', - '--expt-extended-lambda'] + version_dependent_macros - hipcc_args_swiglu = ['-O3', - '-U__CUDA_NO_HALF_OPERATORS__', - '-U__CUDA_NO_HALF_CONVERSIONS__'] + version_dependent_macros - - if IS_ROCM_PYTORCH: - try: - amdgpu_targets = os.environ.get('PYTORCH_ROCM_ARCH', '') - if not amdgpu_targets: - print("Warning: PYTORCH_ROCM_ARCH environment variable is empty.") - print("Using default architecture. Set this variable for specific GPU targets.") - print("Example: export PYTORCH_ROCM_ARCH=gfx906") - amdgpu_targets = "gfx906" # Default to a common architecture - - # Handle multiple architectures (separated by semicolons) - for amdgpu_target in amdgpu_targets.split(';'): - if amdgpu_target: # Skip empty strings - hipcc_args_swiglu += [f'--offload-arch={amdgpu_target}'] - except Exception as e: - print(f"Warning: Error processing PYTORCH_ROCM_ARCH: {e}") - print("Falling back to default architecture gfx906") - hipcc_args_swiglu += ['--offload-arch=gfx906'] - - - ext_modules.append( - CUDAExtension( - name="fused_bias_swiglu", - sources=[ - "csrc/megatron/fused_bias_swiglu.cpp", - "csrc/megatron/fused_bias_swiglu_cuda.cu", - ], - include_dirs=[os.path.join(this_dir, "csrc")], - extra_compile_args={ - "cxx": ["-O3"] + version_dependent_macros, - "nvcc": nvcc_args_swiglu if not IS_ROCM_PYTORCH else hipcc_args_swiglu, - } - ) - ) +def is_env_set(key): + """ + Checks if an environment variable is set and not "". + """ + return bool(os.environ.get(key, None)) -if "--bnp" in sys.argv or "--cuda_ext" in sys.argv: +def op_envvar(op_name): + assert hasattr(ALL_OPS[op_name], 'BUILD_VAR'), \ + f"{op_name} is missing BUILD_VAR field" + return ALL_OPS[op_name].BUILD_VAR - if "--bnp" in sys.argv: - sys.argv.remove("--bnp") - if torch.utils.cpp_extension.CUDA_HOME is None and not IS_ROCM_PYTORCH: - raise RuntimeError("--bnp was requested, but nvcc was not found. Are you sure your environment has nvcc available? If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.") - else: - ext_modules.append( - CUDAExtension(name='bnp', - sources=['apex/contrib/csrc/groupbn/batch_norm.cu', - 'apex/contrib/csrc/groupbn/ipc.cu', - 'apex/contrib/csrc/groupbn/interface.cpp', - 'apex/contrib/csrc/groupbn/batch_norm_add_relu.cu'], - include_dirs=[os.path.join(this_dir, 'csrc'), - os.path.join(this_dir, 'apex/contrib/csrc/groupbn')], - extra_compile_args={'cxx': [] + version_dependent_macros, - 'nvcc':['-DCUDA_HAS_FP16=1', - '-D__CUDA_NO_HALF_OPERATORS__', - '-D__CUDA_NO_HALF_CONVERSIONS__', - '-D__CUDA_NO_HALF2_OPERATORS__'] + version_dependent_macros})) - -if "--xentropy" in sys.argv or "--cuda_ext" in sys.argv: - if "--xentropy" in sys.argv: - sys.argv.remove("--xentropy") - - if torch.utils.cpp_extension.CUDA_HOME is None and not IS_ROCM_PYTORCH: - raise RuntimeError("--xentropy was requested, but nvcc was not found. Are you sure your environment has nvcc available? 
If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.") - else: - print ("INFO: Building the xentropy extension.") - ext_modules.append( - CUDAExtension(name='xentropy_cuda', - sources=['apex/contrib/csrc/xentropy/interface.cpp', - 'apex/contrib/csrc/xentropy/xentropy_kernel.cu'], - include_dirs=[os.path.join(this_dir, 'csrc'), - os.path.join(this_dir, 'apex/contrib/csrc/xentropy')], - extra_compile_args={'cxx': ['-O3'] + version_dependent_macros, - 'nvcc':['-O3'] + version_dependent_macros})) - -if "--focal_loss" in sys.argv or "--cuda_ext" in sys.argv: - if "--focal_loss" in sys.argv: - sys.argv.remove("--focal_loss") - ext_modules.append( - CUDAExtension( - name='focal_loss_cuda', - sources=[ - 'apex/contrib/csrc/focal_loss/focal_loss_cuda.cpp', - 'apex/contrib/csrc/focal_loss/focal_loss_cuda_kernel.cu', - ], - include_dirs=[os.path.join(this_dir, 'csrc')], - extra_compile_args={ - 'cxx': ['-O3'] + version_dependent_macros, - 'nvcc':(['-O3', '--use_fast_math', '--ftz=false'] if not IS_ROCM_PYTORCH else ['-O3']) + version_dependent_macros, - }, - ) - ) +def op_enabled(op_name): + env_var = op_envvar(op_name) + return int(get_env_if_set(env_var, BUILD_OP_DEFAULT)) -if "--index_mul_2d" in sys.argv or "--cuda_ext" in sys.argv: - if "--index_mul_2d" in sys.argv: - sys.argv.remove("--index_mul_2d") +install_ops = dict.fromkeys(ALL_OPS.keys(), False) +for op_name, builder in ALL_OPS.items(): + op_compatible = builder.is_compatible() - args_index_mul_2d = ['-O3'] - if not IS_ROCM_PYTORCH: - args_index_mul_2d += ['--use_fast_math', '--ftz=false'] - if found_aten_atomic_header: - args_index_mul_2d += ['-DATEN_ATOMIC_HEADER'] - - ext_modules.append( - CUDAExtension( - name='fused_index_mul_2d', - sources=[ - 'apex/contrib/csrc/index_mul_2d/index_mul_2d_cuda.cpp', - 'apex/contrib/csrc/index_mul_2d/index_mul_2d_cuda_kernel.cu', - ], - include_dirs=[os.path.join(this_dir, 'csrc')], - extra_compile_args={ - 'cxx': ['-O3'] + version_dependent_macros, - 'nvcc': args_index_mul_2d + version_dependent_macros, - }, - ) - ) + # If op is requested but not available, throw an error. + if op_enabled(op_name) and not op_compatible: + env_var = op_envvar(op_name) + if not is_env_set(env_var): + builder.warning(f"Skip pre-compile of incompatible {op_name}; One can disable {op_name} with {env_var}=0") + continue -if "--deprecated_fused_adam" in sys.argv or "--cuda_ext" in sys.argv: - if "--deprecated_fused_adam" in sys.argv: - sys.argv.remove("--deprecated_fused_adam") + # If op is compatible but install is not enabled (JIT mode). + if IS_ROCM_PYTORCH and op_compatible and not op_enabled(op_name): + builder.hipify_extension() - if torch.utils.cpp_extension.CUDA_HOME is None and not IS_ROCM_PYTORCH: - raise RuntimeError("--deprecated_fused_adam was requested, but nvcc was not found. Are you sure your environment has nvcc available? 
If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.") - else: - print ("INFO: Building deprecated fused adam extension.") - nvcc_args_fused_adam = ['-O3', '--use_fast_math'] + version_dependent_macros - hipcc_args_fused_adam = ['-O3'] + version_dependent_macros - ext_modules.append( - CUDAExtension(name='fused_adam_cuda', - sources=['apex/contrib/csrc/optimizers/fused_adam_cuda.cpp', - 'apex/contrib/csrc/optimizers/fused_adam_cuda_kernel.cu'], - include_dirs=[os.path.join(this_dir, 'csrc'), - os.path.join(this_dir, 'apex/contrib/csrc/optimizers')], - extra_compile_args={'cxx': ['-O3'] + version_dependent_macros, - 'nvcc' : nvcc_args_fused_adam if not IS_ROCM_PYTORCH else hipcc_args_fused_adam})) - -if "--deprecated_fused_lamb" in sys.argv or "--cuda_ext" in sys.argv: - if "--deprecated_fused_lamb" in sys.argv: - sys.argv.remove("--deprecated_fused_lamb") - - if torch.utils.cpp_extension.CUDA_HOME is None and not IS_ROCM_PYTORCH: - raise RuntimeError("--deprecated_fused_lamb was requested, but nvcc was not found. Are you sure your environment has nvcc available? If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.") - else: - print ("INFO: Building deprecated fused lamb extension.") - nvcc_args_fused_lamb = ['-O3', '--use_fast_math'] + version_dependent_macros - hipcc_args_fused_lamb = ['-O3'] + version_dependent_macros - ext_modules.append( - CUDAExtension(name='fused_lamb_cuda', - sources=['apex/contrib/csrc/optimizers/fused_lamb_cuda.cpp', - 'apex/contrib/csrc/optimizers/fused_lamb_cuda_kernel.cu', - 'csrc/multi_tensor_l2norm_kernel.cu'], - include_dirs=[os.path.join(this_dir, 'csrc')], - extra_compile_args = nvcc_args_fused_lamb if not IS_ROCM_PYTORCH else hipcc_args_fused_lamb)) - -# Check, if ATen/CUDAGeneratorImpl.h is found, otherwise use ATen/cuda/CUDAGeneratorImpl.h -# See https://github.com/pytorch/pytorch/pull/70650 -generator_flag = [] -torch_dir = torch.__path__[0] -if os.path.exists(os.path.join(torch_dir, "include", "ATen", "CUDAGeneratorImpl.h")): - generator_flag = ["-DOLD_GENERATOR_PATH"] - -if "--fast_layer_norm" in sys.argv: - sys.argv.remove("--fast_layer_norm") - raise_if_cuda_home_none("--fast_layer_norm") - # Check, if CUDA11 is installed for compute capability 8.0 - cc_flag = [] - _, bare_metal_major, _ = get_cuda_bare_metal_version(CUDA_HOME) - if int(bare_metal_major) >= 11: - cc_flag.append("-gencode") - cc_flag.append("arch=compute_80,code=sm_80") - - if CUDA_HOME is None and not IS_ROCM_PYTORCH: - raise RuntimeError("--fast_layer_norm was requested, but nvcc was not found. Are you sure your environment has nvcc available? 
If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.") - else: - # Check, if CUDA11 is installed for compute capability 8.0 - cc_flag = [] - _, bare_metal_major, _ = get_cuda_bare_metal_version(CUDA_HOME) - if int(bare_metal_major) >= 11: - cc_flag.append('-gencode') - cc_flag.append('arch=compute_80,code=sm_80') - -if "--fmha" in sys.argv: - sys.argv.remove("--fmha") - raise_if_cuda_home_none("--fmha") - # Check, if CUDA11 is installed for compute capability 8.0 - cc_flag = [] - _, bare_metal_major, _ = get_cuda_bare_metal_version(CUDA_HOME) - if int(bare_metal_major) < 11: - raise RuntimeError("--fmha only supported on SM80") - cc_flag.append("-gencode") - cc_flag.append("arch=compute_80,code=sm_80") - - if CUDA_HOME is None and not IS_ROCM_PYTORCH: - raise RuntimeError("--fmha was requested, but nvcc was not found. Are you sure your environment has nvcc available? If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.") - else: - # Check, if CUDA11 is installed for compute capability 8.0 - cc_flag = [] - _, bare_metal_major, _ = get_cuda_bare_metal_version(CUDA_HOME) - if int(bare_metal_major) < 11: - raise RuntimeError("--fmha only supported on SM80") - - ext_modules.append( - CUDAExtension(name='fmhalib', - sources=[ - 'apex/contrib/csrc/fmha/fmha_api.cpp', - 'apex/contrib/csrc/fmha/src/fmha_noloop_reduce.cu', - 'apex/contrib/csrc/fmha/src/fmha_fprop_fp16_128_64_kernel.sm80.cu', - 'apex/contrib/csrc/fmha/src/fmha_fprop_fp16_256_64_kernel.sm80.cu', - 'apex/contrib/csrc/fmha/src/fmha_fprop_fp16_384_64_kernel.sm80.cu', - 'apex/contrib/csrc/fmha/src/fmha_fprop_fp16_512_64_kernel.sm80.cu', - 'apex/contrib/csrc/fmha/src/fmha_dgrad_fp16_128_64_kernel.sm80.cu', - 'apex/contrib/csrc/fmha/src/fmha_dgrad_fp16_256_64_kernel.sm80.cu', - 'apex/contrib/csrc/fmha/src/fmha_dgrad_fp16_384_64_kernel.sm80.cu', - 'apex/contrib/csrc/fmha/src/fmha_dgrad_fp16_512_64_kernel.sm80.cu', - ], - extra_compile_args={'cxx': ['-O3', - ] + version_dependent_macros + generator_flag, - 'nvcc':['-O3', - '-gencode', 'arch=compute_80,code=sm_80', - '-U__CUDA_NO_HALF_OPERATORS__', - '-U__CUDA_NO_HALF_CONVERSIONS__', - '--expt-relaxed-constexpr', - '--expt-extended-lambda', - '--use_fast_math'] + version_dependent_macros + generator_flag + cc_flag}, - include_dirs=[os.path.join(this_dir, "apex/contrib/csrc"), os.path.join(this_dir, "apex/contrib/csrc/fmha/src")])) - - -if "--fast_multihead_attn" in sys.argv or "--cuda_ext" in sys.argv: - if "--fast_multihead_attn" in sys.argv: - sys.argv.remove("--fast_multihead_attn") - - if torch.utils.cpp_extension.CUDA_HOME is None and not IS_ROCM_PYTORCH: - raise RuntimeError("--fast_multihead_attn was requested, but nvcc was not found. Are you sure your environment has nvcc available? 
If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.") - else: - # Check, if CUDA11 is installed for compute capability 8.0 - cc_flag = [] - if not IS_ROCM_PYTORCH: - _, bare_metal_major, _ = get_cuda_bare_metal_version(torch.utils.cpp_extension.CUDA_HOME) - if int(bare_metal_major) >= 11: - cc_flag.append('-gencode') - cc_flag.append('arch=compute_80,code=sm_80') - cc_flag.append('-gencode') - cc_flag.append('arch=compute_86,code=sm_86') - - subprocess.run(["git", "submodule", "update", "--init", "apex/contrib/csrc/multihead_attn/cutlass"]) - nvcc_args_mha = ['-O3', - '-gencode', - 'arch=compute_70,code=sm_70', - '-Iapex/contrib/csrc/multihead_attn/cutlass', - '-U__CUDA_NO_HALF_OPERATORS__', - '-U__CUDA_NO_HALF_CONVERSIONS__', - '--expt-relaxed-constexpr', - '--expt-extended-lambda', - '--use_fast_math'] + version_dependent_macros + generator_flag + cc_flag - hipcc_args_mha = ['-O3', - '-Iapex/contrib/csrc/multihead_attn/cutlass', - '-I/opt/rocm/include/hiprand', - '-I/opt/rocm/include/rocrand', - '-U__HIP_NO_HALF_OPERATORS__', - '-U__HIP_NO_HALF_CONVERSIONS__'] + version_dependent_macros + generator_flag - if found_Backward_Pass_Guard: - hipcc_args_mha = hipcc_args_mha + ['-DBACKWARD_PASS_GUARD'] + ['-DBACKWARD_PASS_GUARD_CLASS=BackwardPassGuard'] - if found_ROCmBackward_Pass_Guard: - hipcc_args_mha = hipcc_args_mha + ['-DBACKWARD_PASS_GUARD'] + ['-DBACKWARD_PASS_GUARD_CLASS=ROCmBackwardPassGuard'] - - ext_modules.append( - CUDAExtension( - name='fast_multihead_attn', - sources=[ - 'apex/contrib/csrc/multihead_attn/multihead_attn_frontend.cpp', - 'apex/contrib/csrc/multihead_attn/additive_masked_softmax_dropout_cuda.cu', - "apex/contrib/csrc/multihead_attn/masked_softmax_dropout_cuda.cu", - "apex/contrib/csrc/multihead_attn/encdec_multihead_attn_cuda.cu", - "apex/contrib/csrc/multihead_attn/encdec_multihead_attn_norm_add_cuda.cu", - "apex/contrib/csrc/multihead_attn/self_multihead_attn_cuda.cu", - "apex/contrib/csrc/multihead_attn/self_multihead_attn_bias_additive_mask_cuda.cu", - "apex/contrib/csrc/multihead_attn/self_multihead_attn_bias_cuda.cu", - "apex/contrib/csrc/multihead_attn/self_multihead_attn_norm_add_cuda.cu", - ], - include_dirs=[os.path.join(this_dir, 'csrc'), - os.path.join(this_dir, 'apex/contrib/csrc/multihead_attn')], - extra_compile_args={'cxx': ['-O3',] + version_dependent_macros + generator_flag, - 'nvcc':nvcc_args_mha if not IS_ROCM_PYTORCH else hipcc_args_mha} - ) - ) + # If op install enabled, add builder to extensions. 
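+    # Worked example based on the env handling above: with no DS_BUILD_*
+    # variables set on Linux, BUILD_OP_DEFAULT is 0, so nothing is added to
+    # ext_modules2 here and each compatible op is instead JIT-built by
+    # OpBuilder.load() on first use. Setting DS_BUILD_OPS=1 pre-compiles all
+    # compatible ops, while e.g. DS_BUILD_FUSED_DENSE=1 pre-compiles only the
+    # fused_dense_cuda op.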
+ if op_enabled(op_name) and op_compatible: + install_ops[op_name] = op_enabled(op_name) + ext_modules2.append(builder.builder()) -if "--transducer" in sys.argv or "--cuda_ext" in sys.argv: - if "--transducer" in sys.argv: - sys.argv.remove("--transducer") +print(f'Install Ops={install_ops}') + +if "--cuda_ext" in sys.argv: + raise_if_home_none("--cuda_ext") if not IS_ROCM_PYTORCH: - raise_if_cuda_home_none("--transducer") - - ext_modules.append( - CUDAExtension( - name="transducer_joint_cuda", - sources=[ - "apex/contrib/csrc/transducer/transducer_joint.cpp", - "apex/contrib/csrc/transducer/transducer_joint_kernel.cu", - ], - extra_compile_args={ - "cxx": ["-O3"] + version_dependent_macros + generator_flag, - "nvcc": append_nvcc_threads(["-O3"] + version_dependent_macros + generator_flag) if not IS_ROCM_PYTORCH - else ["-O3"] + version_dependent_macros + generator_flag, - }, - include_dirs=[os.path.join(this_dir, "csrc"), os.path.join(this_dir, "apex/contrib/csrc/multihead_attn")], - ) - ) - ext_modules.append( - CUDAExtension( - name="transducer_loss_cuda", - sources=[ - "apex/contrib/csrc/transducer/transducer_loss.cpp", - "apex/contrib/csrc/transducer/transducer_loss_kernel.cu", - ], - include_dirs=[os.path.join(this_dir, "csrc")], - extra_compile_args={ - "cxx": ["-O3"] + version_dependent_macros, - "nvcc": append_nvcc_threads(["-O3"] + version_dependent_macros) if not IS_ROCM_PYTORCH - else ["-O3"] + version_dependent_macros, - }, - ) - ) - -# note (mkozuki): Now `--fast_bottleneck` option (i.e. apex/contrib/bottleneck) depends on `--peer_memory` and `--nccl_p2p`. -if "--fast_bottleneck" in sys.argv: - sys.argv.remove("--fast_bottleneck") - raise_if_cuda_home_none("--fast_bottleneck") - if check_cudnn_version_and_warn("--fast_bottleneck", 8400): - subprocess.run(["git", "submodule", "update", "--init", "apex/contrib/csrc/cudnn-frontend/"]) - ext_modules.append( - CUDAExtension( - name="fast_bottleneck", - sources=["apex/contrib/csrc/bottleneck/bottleneck.cpp"], - include_dirs=[os.path.join(this_dir, "apex/contrib/csrc/cudnn-frontend/include")], - extra_compile_args={"cxx": ["-O3"] + version_dependent_macros + generator_flag}, - ) - ) - -if "--peer_memory" in sys.argv or "--cuda_ext" in sys.argv: - if "--peer_memory" in sys.argv: - sys.argv.remove("--peer_memory") - - if not IS_ROCM_PYTORCH: - raise_if_cuda_home_none("--peer_memory") - - ext_modules.append( - CUDAExtension( - name="peer_memory_cuda", - sources=[ - "apex/contrib/csrc/peer_memory/peer_memory_cuda.cu", - "apex/contrib/csrc/peer_memory/peer_memory.cpp", - ], - extra_compile_args={"cxx": ["-O3"] + version_dependent_macros + generator_flag}, - ) - ) - -if "--nccl_p2p" in sys.argv or "--cuda_ext" in sys.argv: - if "--nccl_p2p" in sys.argv: - sys.argv.remove("--nccl_p2p") - - if not IS_ROCM_PYTORCH: - raise_if_cuda_home_none("--nccl_p2p") - - ext_modules.append( - CUDAExtension( - name="nccl_p2p_cuda", - sources=[ - "apex/contrib/csrc/nccl_p2p/nccl_p2p_cuda.cu", - "apex/contrib/csrc/nccl_p2p/nccl_p2p.cpp", - ], - extra_compile_args={"cxx": ["-O3"] + version_dependent_macros + generator_flag}, - ) - ) - - -if "--fused_conv_bias_relu" in sys.argv: - sys.argv.remove("--fused_conv_bias_relu") - raise_if_cuda_home_none("--fused_conv_bias_relu") - if check_cudnn_version_and_warn("--fused_conv_bias_relu", 8400): - subprocess.run(["git", "submodule", "update", "--init", "apex/contrib/csrc/cudnn-frontend/"]) - ext_modules.append( - CUDAExtension( - name="fused_conv_bias_relu", - 
sources=["apex/contrib/csrc/conv_bias_relu/conv_bias_relu.cpp"], - include_dirs=[os.path.join(this_dir, "apex/contrib/csrc/cudnn-frontend/include")], - extra_compile_args={"cxx": ["-O3"] + version_dependent_macros + generator_flag}, - ) - ) + check_cuda_torch_binary_vs_bare_metal(CUDA_HOME) + else: + check_rocm_torch_binary_vs_bare_metal(ROCM_HOME) -#NCCL allocator is supported for apex 1.6 version and onwards -if TORCH_MAJOR == 2 and TORCH_MINOR >= 6: - if "--nccl_allocator" in sys.argv or "--cuda_ext" in sys.argv: - if "--nccl_allocator" in sys.argv: - sys.argv.remove("--nccl_allocator") - raise_if_cuda_home_none("--nccl_allocator") - _nccl_version_getter = load( - name="_nccl_version_getter", - sources=["apex/contrib/csrc/nccl_p2p/nccl_version.cpp", "apex/contrib/csrc/nccl_p2p/nccl_version_check.cu"], - ) - ccl_library = ["nccl"] - if IS_ROCM_PYTORCH: - ccl_library = ["rccl"] - _available_nccl_version = _nccl_version_getter.get_nccl_version() - if _available_nccl_version >= (2, 19): - ext_modules.append( - CUDAExtension( - name="_apex_nccl_allocator", - sources=[ - "apex/contrib/csrc/nccl_allocator/NCCLAllocator.cpp", - ], - include_dirs=[os.path.join(this_dir, "apex/apex/contrib/csrc/nccl_allocator")], - libraries=ccl_library, - extra_compile_args={"cxx": ["-O3"] + version_dependent_macros + generator_flag}, - ) - ) +# Write out version/git info. +git_hash_cmd = shlex.split("bash -c \"git rev-parse --short HEAD\"") +git_branch_cmd = shlex.split("bash -c \"git rev-parse --abbrev-ref HEAD\"") +if command_exists('git') and not is_env_set('DS_BUILD_STRING'): + try: + result = subprocess.check_output(git_hash_cmd) + git_hash = result.decode('utf-8').strip() + result = subprocess.check_output(git_branch_cmd) + git_branch = result.decode('utf-8').strip() + except subprocess.CalledProcessError: + git_hash = "unknown" + git_branch = "unknown" +else: + git_hash = "unknown" + git_branch = "unknown" + +# Parse the DeepSpeed version string from version.txt. +version_str = get_apex_version() + +# Build specifiers like .devX can be added at install time. Otherwise, add the git hash. +# Example: `DS_BUILD_STRING=".dev20201022" python -m build --no-isolation`. + +# Building wheel for distribution, update version file. +if is_env_set('DS_BUILD_STRING'): + # Build string env specified, probably building for distribution. + with open('build.txt', 'w') as fd: + fd.write(os.environ['DS_BUILD_STRING']) + version_str += os.environ['DS_BUILD_STRING'] +elif os.path.isfile('build.txt'): + # build.txt exists, probably installing from distribution. + with open('build.txt', 'r') as fd: + version_str += fd.read().strip() +else: + # None of the above, probably installing from source. + version_str += f'+{git_hash}' + +torch_version = ".".join([str(TORCH_MAJOR), str(TORCH_MINOR)]) +bf16_support = False +# Set cuda_version to 0.0 if cpu-only. +cuda_version = "0.0" +nccl_version = "0.0" +# Set hip_version to 0.0 if cpu-only. +hip_version = "0.0" +if torch.version.cuda is not None: + cuda_version = ".".join(torch.version.cuda.split('.')[:2]) + if sys.platform != "win32": + if isinstance(torch.cuda.nccl.version(), int): + # This will break if minor version > 9. 
+ nccl_version = ".".join(str(torch.cuda.nccl.version())[:2]) else: - warnings.warn( - f"Skip `--nccl_allocator` as it requires NCCL 2.19 or later, but {_available_nccl_version[0]}.{_available_nccl_version[1]}" - ) + nccl_version = ".".join(map(str, torch.cuda.nccl.version()[:2])) + if hasattr(torch.cuda, 'is_bf16_supported') and torch.cuda.is_available(): + bf16_support = torch.cuda.is_bf16_supported() +if hasattr(torch.version, 'hip') and torch.version.hip is not None: + hip_version = ".".join(torch.version.hip.split('.')[:2]) +torch_info = { + "version": torch_version, + "bf16_support": bf16_support, + "cuda_version": cuda_version, + "nccl_version": nccl_version, + "hip_version": hip_version +} + +print(f"version={version_str}, git_hash={git_hash}, git_branch={git_branch}") +with open('apex/git_version_info_installed.py', 'w') as fd: + fd.write(f"version='{version_str}'\n") + fd.write(f"git_hash='{git_hash}'\n") + fd.write(f"git_branch='{git_branch}'\n") + fd.write(f"installed_ops={install_ops}\n") + fd.write(f"accelerator_name='{accelerator_name}'\n") + fd.write(f"torch_info={torch_info}\n") -if "--cuda_ext" in sys.argv: - sys.argv.remove("--cuda_ext") - with open('requirements.txt') as f: required = f.read().splitlines() @@ -986,12 +403,15 @@ def check_if_rocm_pytorch(): name="apex", version=get_apex_version(), packages=find_packages( - exclude=("build", "csrc", "include", "tests", "dist", "docs", "tests", "examples", "apex.egg-info",) + exclude=("build", "include", "tests", "dist", "docs", "tests", "examples", "apex.egg-info", "op_builder", "accelerator") ), description="PyTorch Extensions written by NVIDIA", - ext_modules=ext_modules, - cmdclass={'build_ext': BuildExtension} if ext_modules else {}, + ext_modules=ext_modules2, + cmdclass={'build_ext': BuildExtension} if ext_modules2 else {}, extras_require=extras, - install_requires=required + install_requires=required, + package_data={ + "apex": ["csrc/**/*", "csrc/*"], # include all files in csrc/ + }, ) From d6e0ee4e5b143e09b0ef1efb240dc955fa37302d Mon Sep 17 00:00:00 2001 From: skishore Date: Thu, 26 Jun 2025 10:56:03 +0000 Subject: [PATCH 02/79] add apex/git_version_info_installed.py to gitignore as it is dynamically created by setup.py for the build process --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index 5fe868b36..da67982aa 100644 --- a/.gitignore +++ b/.gitignore @@ -148,3 +148,7 @@ cython_debug/ *.hip *_hip.* *hip* + + +#file temporarily created for build process +apex/git_version_info_installed.py \ No newline at end of file From cb0c9abb84d4f95972800463a9cf0be3ac5dcb63 Mon Sep 17 00:00:00 2001 From: skishore Date: Thu, 26 Jun 2025 11:05:38 +0000 Subject: [PATCH 03/79] add code for building fused rope dynamically --- apex/fused_dense/fused_dense.py | 3 -- apex/transformer/functional/fused_rope.py | 12 ++++++-- op_builder/fused_dense.py | 7 +---- op_builder/fused_rope.py | 36 +++++++++++++++++++++++ setup.py | 5 +--- 5 files changed, 47 insertions(+), 16 deletions(-) create mode 100644 op_builder/fused_rope.py diff --git a/apex/fused_dense/fused_dense.py b/apex/fused_dense/fused_dense.py index f19aae6da..8f9812d20 100644 --- a/apex/fused_dense/fused_dense.py +++ b/apex/fused_dense/fused_dense.py @@ -1,14 +1,11 @@ import torch from torch import nn -#import fused_dense_cuda from apex.op_builder import FusedDenseBuilder from apex._autocast_utils import _cast_if_autocast_enabled import math - fused_dense_cuda = FusedDenseBuilder().load() - #implements fused GEMM+bias in forward 
pass using mlp_cuda from apex class FusedDenseFunc(torch.autograd.Function): @staticmethod diff --git a/apex/transformer/functional/fused_rope.py b/apex/transformer/functional/fused_rope.py index e74906151..7052d91e5 100644 --- a/apex/transformer/functional/fused_rope.py +++ b/apex/transformer/functional/fused_rope.py @@ -50,7 +50,7 @@ def check_if_rocm_pytorch(): except ImportError: AITER_ROPE_BACKEND = False if not AITER_ROPE_BACKEND: - import fused_rotary_positional_embedding + from apex.op_builder import FusedRopeBuilder warnings.warn("Using the native apex kernel for RoPE.", UserWarning) @@ -86,6 +86,7 @@ def forward( freqs: torch.Tensor, transpose_output_memory: bool = False, ) -> torch.Tensor: + fused_rotary_positional_embedding = FusedRopeBuilder().load() output = fused_rotary_positional_embedding.forward( t, freqs, transpose_output_memory ) @@ -97,6 +98,7 @@ def forward( def backward( ctx, grad_output: torch.Tensor ) -> Tuple[Union[torch.Tensor, None], ...]: + fused_rotary_positional_embedding = FusedRopeBuilder().load() (freqs,) = ctx.saved_tensors grad_input = fused_rotary_positional_embedding.backward( grad_output, freqs, ctx.transpose_output_memory @@ -211,6 +213,7 @@ def forward( sin_: torch.Tensor, transpose_output_memory: bool = False, ) -> torch.Tensor: + fused_rotary_positional_embedding = FusedRopeBuilder().load() output = fused_rotary_positional_embedding.forward_cached( t, cos_, sin_, transpose_output_memory ) @@ -223,6 +226,7 @@ def forward( def backward( ctx, grad_output: torch.Tensor ) -> Tuple[Union[torch.Tensor, None], ...]: + fused_rotary_positional_embedding = FusedRopeBuilder().load() cos_, sin_ = ctx.saved_tensors grad_input = fused_rotary_positional_embedding.backward_cached( grad_output, cos_, sin_, ctx.transpose_output_memory @@ -335,6 +339,7 @@ def forward( cu_seqlens: torch.Tensor, freqs: torch.Tensor, ) -> torch.Tensor: + fused_rotary_positional_embedding = FusedRopeBuilder().load() output = fused_rotary_positional_embedding.forward_thd( t, cu_seqlens, freqs ) @@ -345,6 +350,7 @@ def forward( def backward( ctx, grad_output: torch.Tensor ) -> Tuple[Union[torch.Tensor, None], ...]: + fused_rotary_positional_embedding = FusedRopeBuilder().load() cu_seqlens, freqs = ctx.saved_tensors grad_input = fused_rotary_positional_embedding.backward_thd( grad_output, cu_seqlens, freqs @@ -448,6 +454,7 @@ def forward( cos_w: torch.Tensor, sin_w: torch.Tensor, ) -> torch.Tensor: + fused_rotary_positional_embedding = FusedRopeBuilder().load() t = t.view(t.shape[0], img_h, img_w, t.shape[2], t.shape[3]) output = fused_rotary_positional_embedding.forward_2d( t, cos_h, sin_h, cos_w, sin_w @@ -461,9 +468,8 @@ def forward( def backward( ctx, grad_output: torch.Tensor ) -> Tuple[Union[torch.Tensor, None], ...]: - + fused_rotary_positional_embedding = FusedRopeBuilder().load() cos_h, sin_h, cos_w, sin_w = ctx.saved_tensors - grad_output = grad_output.view( grad_output.shape[0], ctx.img_h, diff --git a/op_builder/fused_dense.py b/op_builder/fused_dense.py index 2b8ef178f..a4c7b25f6 100644 --- a/op_builder/fused_dense.py +++ b/op_builder/fused_dense.py @@ -1,8 +1,3 @@ -# Copyright (c) Microsoft Corporation. 
-# SPDX-License-Identifier: Apache-2.0 - -# DeepSpeed Team - from .builder import CUDAOpBuilder import sys @@ -22,7 +17,7 @@ def sources(self): return ['csrc/fused_dense_base.cpp', 'csrc/fused_dense_cuda.cu'] def include_paths(self): - return [] + return ['csrc'] def cxx_args(self): args = super().cxx_args() diff --git a/op_builder/fused_rope.py b/op_builder/fused_rope.py new file mode 100644 index 000000000..4397a659b --- /dev/null +++ b/op_builder/fused_rope.py @@ -0,0 +1,36 @@ +from .builder import CUDAOpBuilder + +import sys + + +class FusedRopeBuilder(CUDAOpBuilder): + BUILD_VAR = 'DS_BUILD_FUSED_ROPE' + NAME = "fused_rotary_positional_embedding" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ["csrc/megatron/fused_rotary_positional_embedding.cpp", + "csrc/megatron/fused_rotary_positional_embedding_cuda.cu"] + + def include_paths(self): + return ['csrc'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + nvcc_flags = ['-O3'] + self.version_dependent_macros() + if self.is_rocm_pytorch(): + ROCM_MAJOR, ROCM_MINOR = self.installed_rocm_version() + nvcc_flags += ['-DROCM_VERSION_MAJOR=%s' % ROCM_MAJOR, '-DROCM_VERSION_MINOR=%s' % ROCM_MINOR] + else: + nvcc_flags.extend( + ['-allow-unsupported-compiler' if sys.platform == "win32" else '', '-lineinfo', '--use_fast_math'] + + self.compute_capability_args()) + return nvcc_flags \ No newline at end of file diff --git a/setup.py b/setup.py index 96b2d8a1d..7a9d5cafc 100644 --- a/setup.py +++ b/setup.py @@ -409,9 +409,6 @@ def op_enabled(op_name): ext_modules=ext_modules2, cmdclass={'build_ext': BuildExtension} if ext_modules2 else {}, extras_require=extras, - install_requires=required, - package_data={ - "apex": ["csrc/**/*", "csrc/*"], # include all files in csrc/ - }, + install_requires=required ) From f129b0d6dee7124dae7d0d56bd56a604a82cd804 Mon Sep 17 00:00:00 2001 From: skishore Date: Thu, 26 Jun 2025 11:52:39 +0000 Subject: [PATCH 04/79] add code for building fused bias swiglu dynamically --- op_builder/fused_bias_swiglu.py | 57 +++++++++++++++++++ .../run_transformer/test_fused_bias_swiglu.py | 3 +- 2 files changed, 59 insertions(+), 1 deletion(-) create mode 100644 op_builder/fused_bias_swiglu.py diff --git a/op_builder/fused_bias_swiglu.py b/op_builder/fused_bias_swiglu.py new file mode 100644 index 000000000..adc729e6f --- /dev/null +++ b/op_builder/fused_bias_swiglu.py @@ -0,0 +1,57 @@ +from .builder import CUDAOpBuilder +import sys +import os + +class FusedBiasSwiGLUBuilder(CUDAOpBuilder): + BUILD_VAR = 'DS_BUILD_FUSED_BIAS_SWIGLU' + NAME = "fused_bias_swiglu" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return [ + "csrc/megatron/fused_bias_swiglu.cpp", + "csrc/megatron/fused_bias_swiglu_cuda.cu" + ] + + def include_paths(self): + return ['csrc'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + nvcc_flags = [ + '-O3', + '-U__CUDA_NO_HALF_OPERATORS__', + '-U__CUDA_NO_HALF_CONVERSIONS__', + '--expt-relaxed-constexpr', + '--expt-extended-lambda' + ] + self.version_dependent_macros() + if self.is_rocm_pytorch(): + nvcc_flags = [ + '-O3', + '-U__CUDA_NO_HALF_OPERATORS__', + '-U__CUDA_NO_HALF_CONVERSIONS__' + ] + self.version_dependent_macros() + # Handle ROCm arch flags + amdgpu_targets = 
os.environ.get('PYTORCH_ROCM_ARCH', '') + if not amdgpu_targets: + print("Warning: PYTORCH_ROCM_ARCH environment variable is empty.") + print("Using default architecture. Set this variable for specific GPU targets.") + print("Example: export PYTORCH_ROCM_ARCH=gfx906") + amdgpu_targets = "gfx906" + try: + for amdgpu_target in amdgpu_targets.split(';'): + if amdgpu_target: + nvcc_flags += [f'--offload-arch={amdgpu_target}'] + except Exception as e: + print(f"Warning: Error processing PYTORCH_ROCM_ARCH: {e}") + print("Falling back to default architecture gfx906") + nvcc_flags += ['--offload-arch=gfx906'] + return nvcc_flags diff --git a/tests/L0/run_transformer/test_fused_bias_swiglu.py b/tests/L0/run_transformer/test_fused_bias_swiglu.py index e7c2e4793..66cf4c1e8 100644 --- a/tests/L0/run_transformer/test_fused_bias_swiglu.py +++ b/tests/L0/run_transformer/test_fused_bias_swiglu.py @@ -1,8 +1,9 @@ import torch -import fused_bias_swiglu +from apex.op_builder import FusedBiasSwiGLUBuilder from torch.testing._internal import common_utils import torch.nn.functional as F +fused_bias_swiglu = FusedBiasSwiGLUBuilder().load() class TestFusedBiasSwiGLU(common_utils.TestCase): From be60325e0a263812ae8b2072609d28ad30434712 Mon Sep 17 00:00:00 2001 From: skishore Date: Fri, 27 Jun 2025 11:24:56 +0000 Subject: [PATCH 05/79] fix the code so that fused rope and fused softmax are not compiled in jit mode, add csrc back to setup.py since it is not copied to apex wheel --- apex/transformer/functional/fused_softmax.py | 14 +++---- op_builder/fused_rope.py | 2 +- .../generic_scaled_masked_softmax_cuda.py | 40 ++++++++++++++++++ op_builder/scaled_masked_softmax_cuda.py | 41 ++++++++++++++++++ op_builder/scaled_softmax_cuda.py | 42 +++++++++++++++++++ ...scaled_upper_triang_masked_softmax_cuda.py | 40 ++++++++++++++++++ setup.py | 5 ++- 7 files changed, 173 insertions(+), 11 deletions(-) create mode 100644 op_builder/generic_scaled_masked_softmax_cuda.py create mode 100644 op_builder/scaled_masked_softmax_cuda.py create mode 100644 op_builder/scaled_softmax_cuda.py create mode 100644 op_builder/scaled_upper_triang_masked_softmax_cuda.py diff --git a/apex/transformer/functional/fused_softmax.py b/apex/transformer/functional/fused_softmax.py index 83243ef7b..f0f936eec 100644 --- a/apex/transformer/functional/fused_softmax.py +++ b/apex/transformer/functional/fused_softmax.py @@ -16,7 +16,12 @@ from apex._autocast_utils import _cast_if_autocast_enabled from apex.transformer.enums import AttnMaskType +from apex.op_builder import ScaledSoftmaxCudaBuilder, ScaledUpperTriangMaskedSoftmaxCudaBuilder, GenericScaledMaskedSoftmaxCudaBuilder, ScaledMaskedSoftmaxCudaBuilder +scaled_softmax_cuda = ScaledSoftmaxCudaBuilder().load() +scaled_upper_triang_masked_softmax_cuda = ScaledUpperTriangMaskedSoftmaxCudaBuilder().load() +generic_scaled_masked_softmax_cuda = GenericScaledMaskedSoftmaxCudaBuilder().load() +scaled_masked_softmax_cuda = ScaledMaskedSoftmaxCudaBuilder().load() class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function): """ @@ -28,7 +33,6 @@ class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function): @staticmethod def forward(ctx, inputs, scale): - import scaled_upper_triang_masked_softmax_cuda scale_t = torch.tensor([scale]) softmax_results = scaled_upper_triang_masked_softmax_cuda.forward( @@ -40,7 +44,6 @@ def forward(ctx, inputs, scale): @staticmethod def backward(ctx, output_grads): - import scaled_upper_triang_masked_softmax_cuda softmax_results, scale_t = ctx.saved_tensors input_grads = 
scaled_upper_triang_masked_softmax_cuda.backward( @@ -71,7 +74,6 @@ def scaled_upper_triang_masked_softmax(inputs, _, scale): class ScaledMaskedSoftmax(torch.autograd.Function): @staticmethod def forward(ctx, inputs, mask, scale): - import scaled_masked_softmax_cuda scale_t = torch.tensor([scale]) @@ -81,7 +83,6 @@ def forward(ctx, inputs, mask, scale): @staticmethod def backward(ctx, output_grads): - import scaled_masked_softmax_cuda softmax_results, scale_t = ctx.saved_tensors @@ -106,7 +107,6 @@ def scaled_masked_softmax(inputs, mask, scale): class GenericScaledMaskedSoftmax(torch.autograd.Function): @staticmethod def forward(ctx, inputs, mask, scale): - import generic_scaled_masked_softmax_cuda scale_t = torch.tensor([scale]) softmax_results = generic_scaled_masked_softmax_cuda.forward(inputs, mask, scale_t[0]) @@ -115,7 +115,6 @@ def forward(ctx, inputs, mask, scale): @staticmethod def backward(ctx, output_grads): - import generic_scaled_masked_softmax_cuda_new softmax_results, scale_t = ctx.saved_tensors @@ -139,7 +138,6 @@ class ScaledSoftmax(torch.autograd.Function): @staticmethod def forward(ctx, inputs, scale): - import scaled_softmax_cuda scale_t = torch.tensor([scale]) @@ -151,7 +149,6 @@ def forward(ctx, inputs, scale): @staticmethod def backward(ctx, output_grads): - import scaled_softmax_cuda softmax_results, scale_t = ctx.saved_tensors @@ -269,6 +266,5 @@ def forward_torch_softmax(self, input, mask): @staticmethod def get_batch_per_block(sq, sk, b, np): - import scaled_masked_softmax_cuda return scaled_masked_softmax_cuda.get_batch_per_block(sq, sk, b, np) diff --git a/op_builder/fused_rope.py b/op_builder/fused_rope.py index 4397a659b..5237830b2 100644 --- a/op_builder/fused_rope.py +++ b/op_builder/fused_rope.py @@ -18,7 +18,7 @@ def sources(self): "csrc/megatron/fused_rotary_positional_embedding_cuda.cu"] def include_paths(self): - return ['csrc'] + return ['csrc', 'csrc/megatron'] def cxx_args(self): args = super().cxx_args() diff --git a/op_builder/generic_scaled_masked_softmax_cuda.py b/op_builder/generic_scaled_masked_softmax_cuda.py new file mode 100644 index 000000000..322d623f0 --- /dev/null +++ b/op_builder/generic_scaled_masked_softmax_cuda.py @@ -0,0 +1,40 @@ +from .builder import CUDAOpBuilder + +class GenericScaledMaskedSoftmaxCudaBuilder(CUDAOpBuilder): + BUILD_VAR = 'DS_BUILD_GENERIC_SCALED_MASKED_SOFTMAX_CUDA' + NAME = "generic_scaled_masked_softmax_cuda" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return [ + "csrc/megatron/generic_scaled_masked_softmax_cpu.cpp", + "csrc/megatron/generic_scaled_masked_softmax_cuda.cu" + ] + + def include_paths(self): + return ['csrc'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + if self.is_rocm_pytorch(): + return [ + '-O3', + '-U__CUDA_NO_HALF_OPERATORS__', + '-U__CUDA_NO_HALF_CONVERSIONS__' + ] + self.version_dependent_macros() + else: + return [ + '-O3', + '-U__CUDA_NO_HALF_OPERATORS__', + '-U__CUDA_NO_HALF_CONVERSIONS__', + '--expt-relaxed-constexpr', + '--expt-extended-lambda' + ] + self.version_dependent_macros() diff --git a/op_builder/scaled_masked_softmax_cuda.py b/op_builder/scaled_masked_softmax_cuda.py new file mode 100644 index 000000000..eddbf0969 --- /dev/null +++ b/op_builder/scaled_masked_softmax_cuda.py @@ -0,0 +1,41 @@ +from .builder import CUDAOpBuilder + +class ScaledMaskedSoftmaxCudaBuilder(CUDAOpBuilder): + BUILD_VAR = 
'DS_BUILD_SCALED_MASKED_SOFTMAX_CUDA' + NAME = "scaled_masked_softmax_cuda" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return [ + "csrc/megatron/scaled_masked_softmax_cpu.cpp", + "csrc/megatron/scaled_masked_softmax_cuda.cu" + ] + + def include_paths(self): + # Both csrc and csrc/megatron are included in the original extension + return ['csrc', 'csrc/megatron'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + if self.is_rocm_pytorch(): + return [ + '-O3', + '-U__CUDA_NO_HALF_OPERATORS__', + '-U__CUDA_NO_HALF_CONVERSIONS__' + ] + self.version_dependent_macros() + else: + return [ + '-O3', + '-U__CUDA_NO_HALF_OPERATORS__', + '-U__CUDA_NO_HALF_CONVERSIONS__', + '--expt-relaxed-constexpr', + '--expt-extended-lambda' + ] + self.version_dependent_macros() diff --git a/op_builder/scaled_softmax_cuda.py b/op_builder/scaled_softmax_cuda.py new file mode 100644 index 000000000..24fa0544b --- /dev/null +++ b/op_builder/scaled_softmax_cuda.py @@ -0,0 +1,42 @@ +from .builder import CUDAOpBuilder + +import sys + +class ScaledSoftmaxCudaBuilder(CUDAOpBuilder): + BUILD_VAR = 'DS_BUILD_SCALED_SOFTMAX_CUDA' + NAME = "scaled_softmax_cuda" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return [ + "csrc/megatron/scaled_softmax_cpu.cpp", + "csrc/megatron/scaled_softmax_cuda.cu" + ] + + def include_paths(self): + return ['csrc'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + nvcc_flags = [ + '-O3', + '-U__CUDA_NO_HALF_OPERATORS__', + '-U__CUDA_NO_HALF_CONVERSIONS__', + '--expt-relaxed-constexpr', + '--expt-extended-lambda' + ] + self.version_dependent_macros() + if self.is_rocm_pytorch(): + nvcc_flags = [ + '-O3', + '-U__CUDA_NO_HALF_OPERATORS__', + '-U__CUDA_NO_HALF_CONVERSIONS__' + ] + self.version_dependent_macros() + return nvcc_flags diff --git a/op_builder/scaled_upper_triang_masked_softmax_cuda.py b/op_builder/scaled_upper_triang_masked_softmax_cuda.py new file mode 100644 index 000000000..c9f72db6f --- /dev/null +++ b/op_builder/scaled_upper_triang_masked_softmax_cuda.py @@ -0,0 +1,40 @@ +from .builder import CUDAOpBuilder + +class ScaledUpperTriangMaskedSoftmaxCudaBuilder(CUDAOpBuilder): + BUILD_VAR = 'DS_BUILD_SCALED_UPPER_TRIANG_MASKED_SOFTMAX_CUDA' + NAME = "scaled_upper_triang_masked_softmax_cuda" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return [ + "csrc/megatron/scaled_upper_triang_masked_softmax_cpu.cpp", + "csrc/megatron/scaled_upper_triang_masked_softmax_cuda.cu" + ] + + def include_paths(self): + return ['csrc'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + if self.is_rocm_pytorch(): + return [ + '-O3', + '-U__CUDA_NO_HALF_OPERATORS__', + '-U__CUDA_NO_HALF_CONVERSIONS__' + ] + self.version_dependent_macros() + else: + return [ + '-O3', + '-U__CUDA_NO_HALF_OPERATORS__', + '-U__CUDA_NO_HALF_CONVERSIONS__', + '--expt-relaxed-constexpr', + '--expt-extended-lambda' + ] + self.version_dependent_macros() diff --git a/setup.py b/setup.py index 7a9d5cafc..96b2d8a1d 100644 --- a/setup.py +++ b/setup.py @@ -409,6 +409,9 @@ def op_enabled(op_name): ext_modules=ext_modules2, 
cmdclass={'build_ext': BuildExtension} if ext_modules2 else {}, extras_require=extras, - install_requires=required + install_requires=required, + package_data={ + "apex": ["csrc/**/*", "csrc/*"], # include all files in csrc/ + }, ) From 7b9276c016dd3a6764e2f4fd50d4c53d34b4f62b Mon Sep 17 00:00:00 2001 From: skishore Date: Mon, 30 Jun 2025 16:23:33 +0000 Subject: [PATCH 06/79] load the jit modules inside and this prevents them from building when building the wheel --- apex/transformer/functional/fused_softmax.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/apex/transformer/functional/fused_softmax.py b/apex/transformer/functional/fused_softmax.py index f0f936eec..54830d50a 100644 --- a/apex/transformer/functional/fused_softmax.py +++ b/apex/transformer/functional/fused_softmax.py @@ -18,10 +18,6 @@ from apex.transformer.enums import AttnMaskType from apex.op_builder import ScaledSoftmaxCudaBuilder, ScaledUpperTriangMaskedSoftmaxCudaBuilder, GenericScaledMaskedSoftmaxCudaBuilder, ScaledMaskedSoftmaxCudaBuilder -scaled_softmax_cuda = ScaledSoftmaxCudaBuilder().load() -scaled_upper_triang_masked_softmax_cuda = ScaledUpperTriangMaskedSoftmaxCudaBuilder().load() -generic_scaled_masked_softmax_cuda = GenericScaledMaskedSoftmaxCudaBuilder().load() -scaled_masked_softmax_cuda = ScaledMaskedSoftmaxCudaBuilder().load() class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function): """ @@ -34,6 +30,8 @@ class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function): @staticmethod def forward(ctx, inputs, scale): + scaled_upper_triang_masked_softmax_cuda = ScaledUpperTriangMaskedSoftmaxCudaBuilder().load() + scale_t = torch.tensor([scale]) softmax_results = scaled_upper_triang_masked_softmax_cuda.forward( inputs, scale_t[0] @@ -45,6 +43,8 @@ def forward(ctx, inputs, scale): @staticmethod def backward(ctx, output_grads): + scaled_upper_triang_masked_softmax_cuda = ScaledUpperTriangMaskedSoftmaxCudaBuilder().load() + softmax_results, scale_t = ctx.saved_tensors input_grads = scaled_upper_triang_masked_softmax_cuda.backward( output_grads, softmax_results, scale_t[0] @@ -74,6 +74,7 @@ def scaled_upper_triang_masked_softmax(inputs, _, scale): class ScaledMaskedSoftmax(torch.autograd.Function): @staticmethod def forward(ctx, inputs, mask, scale): + scaled_masked_softmax_cuda = ScaledMaskedSoftmaxCudaBuilder().load() scale_t = torch.tensor([scale]) @@ -83,6 +84,7 @@ def forward(ctx, inputs, mask, scale): @staticmethod def backward(ctx, output_grads): + scaled_masked_softmax_cuda = ScaledMaskedSoftmaxCudaBuilder().load() softmax_results, scale_t = ctx.saved_tensors @@ -108,6 +110,8 @@ class GenericScaledMaskedSoftmax(torch.autograd.Function): @staticmethod def forward(ctx, inputs, mask, scale): + generic_scaled_masked_softmax_cuda = GenericScaledMaskedSoftmaxCudaBuilder().load() + scale_t = torch.tensor([scale]) softmax_results = generic_scaled_masked_softmax_cuda.forward(inputs, mask, scale_t[0]) ctx.save_for_backward(softmax_results, scale_t) @@ -116,6 +120,8 @@ def forward(ctx, inputs, mask, scale): @staticmethod def backward(ctx, output_grads): + generic_scaled_masked_softmax_cuda = GenericScaledMaskedSoftmaxCudaBuilder().load() + softmax_results, scale_t = ctx.saved_tensors input_grads = generic_scaled_masked_softmax_cuda.backward(output_grads, softmax_results, scale_t[0]) @@ -138,6 +144,7 @@ class ScaledSoftmax(torch.autograd.Function): @staticmethod def forward(ctx, inputs, scale): + scaled_softmax_cuda = ScaledSoftmaxCudaBuilder().load() scale_t = 
torch.tensor([scale]) @@ -150,6 +157,8 @@ def forward(ctx, inputs, scale): @staticmethod def backward(ctx, output_grads): + scaled_softmax_cuda = ScaledSoftmaxCudaBuilder().load() + softmax_results, scale_t = ctx.saved_tensors input_grads = scaled_softmax_cuda.backward( @@ -267,4 +276,6 @@ def forward_torch_softmax(self, input, mask): @staticmethod def get_batch_per_block(sq, sk, b, np): + scaled_masked_softmax_cuda = ScaledMaskedSoftmaxCudaBuilder().load() + return scaled_masked_softmax_cuda.get_batch_per_block(sq, sk, b, np) From eea4c0f1197592a21a65fa28f6b5e0eebccbeb59 Mon Sep 17 00:00:00 2001 From: skishore Date: Thu, 3 Jul 2025 07:37:10 +0000 Subject: [PATCH 07/79] convert syncbn module to jit --- apex/parallel/__init__.py | 3 +- apex/parallel/optimized_sync_batchnorm.py | 3 - .../optimized_sync_batchnorm_kernel.py | 4 +- op_builder/syncbn.py | 35 + tests/L0/log_test.txt | 4959 +++++++++++++++++ .../synced_batchnorm/single_gpu_unit_test.py | 5 +- .../synced_batchnorm/test_groups.py | 3 +- .../synced_batchnorm/two_gpu_unit_test.py | 3 +- 8 files changed, 5006 insertions(+), 9 deletions(-) create mode 100644 op_builder/syncbn.py create mode 100644 tests/L0/log_test.txt diff --git a/apex/parallel/__init__.py b/apex/parallel/__init__.py index 3cd7ae56e..f2368f8d8 100644 --- a/apex/parallel/__init__.py +++ b/apex/parallel/__init__.py @@ -12,7 +12,8 @@ # for both the cuda-enabled and python-fallback versions, and I don't want # to suppress the error information. try: - import syncbn + from apex.op_builder import SyncBnBuilder + syncbn = SyncBnBuilder().load() from .optimized_sync_batchnorm import SyncBatchNorm except ImportError as err: from .sync_batchnorm import SyncBatchNorm diff --git a/apex/parallel/optimized_sync_batchnorm.py b/apex/parallel/optimized_sync_batchnorm.py index 65cf5eabf..02828578f 100644 --- a/apex/parallel/optimized_sync_batchnorm.py +++ b/apex/parallel/optimized_sync_batchnorm.py @@ -1,11 +1,8 @@ import torch from torch.nn.modules.batchnorm import _BatchNorm from torch.nn import functional as F - -import syncbn from .optimized_sync_batchnorm_kernel import SyncBatchnormFunction - class SyncBatchNorm(_BatchNorm): """ synchronized batch normalization module extented from `torch.nn.BatchNormNd` diff --git a/apex/parallel/optimized_sync_batchnorm_kernel.py b/apex/parallel/optimized_sync_batchnorm_kernel.py index 616847149..93852e099 100644 --- a/apex/parallel/optimized_sync_batchnorm_kernel.py +++ b/apex/parallel/optimized_sync_batchnorm_kernel.py @@ -1,9 +1,11 @@ import torch from torch.autograd.function import Function -import syncbn +from apex.op_builder import SyncBnBuilder from apex.parallel import ReduceOp +syncbn = SyncBnBuilder().load() + class SyncBatchnormFunction(Function): @staticmethod diff --git a/op_builder/syncbn.py b/op_builder/syncbn.py new file mode 100644 index 000000000..6869b69ea --- /dev/null +++ b/op_builder/syncbn.py @@ -0,0 +1,35 @@ +from .builder import CUDAOpBuilder + +import sys + + +class SyncBnBuilder(CUDAOpBuilder): + BUILD_VAR = 'DS_BUILD_SYNCBN' + NAME = "syncbn" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ['csrc/syncbn.cpp', 'csrc/welford.cu'] + + def include_paths(self): + return ['csrc'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + nvcc_flags = ['-O3'] + self.version_dependent_macros() + if self.is_rocm_pytorch(): + ROCM_MAJOR, ROCM_MINOR = 
self.installed_rocm_version() + nvcc_flags += ['-DROCM_VERSION_MAJOR=%s' % ROCM_MAJOR, '-DROCM_VERSION_MINOR=%s' % ROCM_MINOR] + else: + nvcc_flags.extend( + ['-allow-unsupported-compiler' if sys.platform == "win32" else '', '-lineinfo', '--use_fast_math'] + + self.compute_capability_args()) + return nvcc_flags \ No newline at end of file diff --git a/tests/L0/log_test.txt b/tests/L0/log_test.txt new file mode 100644 index 000000000..2bdda3252 --- /dev/null +++ b/tests/L0/log_test.txt @@ -0,0 +1,4959 @@ +test_add_param_group (test_add_param_group.TestAddParamGroup) ... ok +test_bce_is_float_with_allow_banned (test_basic_casts.TestBannedMethods) ... ok +test_bce_raises_by_default (test_basic_casts.TestBannedMethods) ... ok +test_batch_norm_is_match (test_basic_casts.TestBasicCastsBFloat16) ... ok +test_conv2d_is_bfloat16 (test_basic_casts.TestBasicCastsBFloat16) ... skipped "test doesn't currently work on ROCm stack." +test_group_norm_is_float (test_basic_casts.TestBasicCastsBFloat16) ... ok +test_linear_is_bfloat16 (test_basic_casts.TestBasicCastsBFloat16) ... skipped "test doesn't currently work on ROCm stack." +test_mse_loss_is_float (test_basic_casts.TestBasicCastsBFloat16) ... ok +test_relu_is_match (test_basic_casts.TestBasicCastsBFloat16) ... ok +test_softmax_is_float (test_basic_casts.TestBasicCastsBFloat16) ... ok +test_batch_norm_is_match (test_basic_casts.TestBasicCastsHalf) ... ok +test_conv2d_is_half (test_basic_casts.TestBasicCastsHalf) ... ok +test_group_norm_is_float (test_basic_casts.TestBasicCastsHalf) ... ok +test_linear_is_half (test_basic_casts.TestBasicCastsHalf) ... ok +test_mse_loss_is_float (test_basic_casts.TestBasicCastsHalf) ... ok +test_relu_is_match (test_basic_casts.TestBasicCastsHalf) ... ok +test_softmax_is_float (test_basic_casts.TestBasicCastsHalf) ... ok +test_cpu_is_float (test_basic_casts.TestTensorCastsBFloat16) ... ok +test_matmul_method_is_bfloat16 (test_basic_casts.TestTensorCastsBFloat16) ... skipped "test doesn't currently work on ROCm stack." +test_matmul_op_is_bfloat16 (test_basic_casts.TestTensorCastsBFloat16) ... skipped "test doesn't currently work on ROCm stack." +test_pow_method_is_float (test_basic_casts.TestTensorCastsBFloat16) ... ok +test_pow_op_is_float (test_basic_casts.TestTensorCastsBFloat16) ... ok +test_sum_is_float (test_basic_casts.TestTensorCastsBFloat16) ... ok +test_cpu_is_float (test_basic_casts.TestTensorCastsHalf) ... ok +test_matmul_method_is_half (test_basic_casts.TestTensorCastsHalf) ... ok +test_matmul_op_is_half (test_basic_casts.TestTensorCastsHalf) ... ok +test_pow_method_is_float (test_basic_casts.TestTensorCastsHalf) ... ok +test_pow_op_is_float (test_basic_casts.TestTensorCastsHalf) ... ok +test_sum_is_float (test_basic_casts.TestTensorCastsHalf) ... ok +test_blacklist_module_bfp16_weight (test_cache.TestCache) ... ok +test_blacklist_module_fp16_weight (test_cache.TestCache) ... ok +test_blacklist_module_fp32_weight (test_cache.TestCache) ... ok +test_promote_module_bfp16_weight (test_cache.TestCache) ... ok +test_promote_module_fp16_weight (test_cache.TestCache) ... ok +test_promote_module_fp32_weight (test_cache.TestCache) ... ok +test_whitelist_module_bfp16_weight (test_cache.TestCache) ... ok +test_whitelist_module_fp16_weight (test_cache.TestCache) ... ok +test_whitelist_module_fp32_weight (test_cache.TestCache) ... ok +test_loss_scale_decrease (test_checkpointing.TestCheckpointing) ... skipped 'Test is flaky.' +test_restoring (test_checkpointing.TestCheckpointing) ... 
ok +test_state_dict (test_checkpointing.TestCheckpointing) ... /skishore/github/pytorch/torch/utils/_device.py:100: UserWarning: Converting a tensor with requires_grad=True to a scalar may lead to unexpected behavior. +Consider using tensor.detach() first. (Triggered internally at /skishore/github/pytorch/aten/src/ATen/native/Scalar.cpp:22.) + return func(*args, **kwargs) +ok +test_2models2losses1optimizer (test_fused_sgd.TestMultipleModelsOptimizersLosses) ... ok +test_2models2losses2optimizers (test_fused_sgd.TestMultipleModelsOptimizersLosses) ... ok +test_3models2losses1optimizer (test_fused_sgd.TestMultipleModelsOptimizersLosses) ... ok +test_3models2losses2optimizers (test_fused_sgd.TestMultipleModelsOptimizersLosses) ... ok +test_larc_mixed_precision (test_larc.TestLARC) ... ok +test_fuzz (test_multi_tensor_axpby.TestMultiTensorAxpby) ... ok +test_fuzz_nhwc (test_multi_tensor_axpby.TestMultiTensorAxpby) ... ok +test_fuzz (test_multi_tensor_l2norm.TestMultiTensorL2Norm) ... /skishore/github/apex/tests/L0/run_amp/test_multi_tensor_l2norm.py:37: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at /skishore/github/pytorch/torch/csrc/tensor/python_tensor.cpp:78.) + a = torch.cuda.FloatTensor(sizea).fill_(self.val) +ok +test_fuzz (test_multi_tensor_scale.TestMultiTensorScale) ... ok +test_2models2losses1optimizer (test_multiple_models_optimizers_losses.TestMultipleModelsOptimizersLosses) ... ok +test_2models2losses2optimizers (test_multiple_models_optimizers_losses.TestMultipleModelsOptimizersLosses) ... ok +test_3models2losses1optimizer (test_multiple_models_optimizers_losses.TestMultipleModelsOptimizersLosses) ... ok +test_3models2losses2optimizers (test_multiple_models_optimizers_losses.TestMultipleModelsOptimizersLosses) ... ok +test_cat_matches_widest (test_promotion.TestPromotionBFloat16) ... ok +test_inplace_add_matches_self (test_promotion.TestPromotionBFloat16) ... ok +test_inplace_exp_is_error_for_bfloat16 (test_promotion.TestPromotionBFloat16) ... ok +test_mul_matches_widest (test_promotion.TestPromotionBFloat16) ... ok +test_atan2_matches_widest (test_promotion.TestPromotionHalf) ... ok +test_cat_matches_widest (test_promotion.TestPromotionHalf) ... ok +test_inplace_add_matches_self (test_promotion.TestPromotionHalf) ... ok +test_inplace_exp_is_error_for_half (test_promotion.TestPromotionHalf) ... ok +test_mul_matches_widest (test_promotion.TestPromotionHalf) ... ok +test_gru_cell_is_half (test_rnn.TestRnnCells) ... ok +test_lstm_cell_is_half (test_rnn.TestRnnCells) ... ok +test_rnn_cell_is_half (test_rnn.TestRnnCells) ... ok +test_gru_is_half (test_rnn.TestRnns) ... skipped "test doesn't currently work on ROCm stack." +test_lstm_is_half (test_rnn.TestRnns) ... skipped "test doesn't currently work on ROCm stack." +test_rnn_is_half (test_rnn.TestRnns) ... skipped "test doesn't currently work on ROCm stack." +test_rnn_packed_sequence (test_rnn.TestRnns) ... skipped "test doesn't currently work on ROCm stack." + +---------------------------------------------------------------------- +Ran 70 tests in 77.770s + +OK (skipped=9) +test_output_is_half (test_fp16util.TestFP16Model) ... ok +test_params_and_buffers (test_fp16util.TestFP16Model) ... ok + +---------------------------------------------------------------------- +Ran 2 tests in 0.577s + +OK +testGradScaler (test_adam.AdamTest) ... 
ok +testGradScalerCapturable (test_adam.AdamTest) ... /skishore/github/pytorch/torch/amp/grad_scaler.py:423: FutureWarning: GradScaler is going to stop passing itself as a keyword argument to the passed optimizer. In the near future GradScaler registers `grad_scale: Tensor` and `found_inf: Tensor` to the passed optimizer and let the optimizer use them directly. + warnings.warn( +ok +testGradScalerCapturableMaster (test_adam.AdamTest) ... ok +testLargeTensor (test_adam.AdamTest) ... ok +testNative (test_adam.AdamTest) ... ok +test_float (test_fused_novograd.TestFusedNovoGrad) ... ok +test_half (test_fused_novograd.TestFusedNovoGrad) ... ok +test_multi_device (test_fused_novograd.TestFusedNovoGrad) ... ok +test_multi_params (test_fused_novograd.TestFusedNovoGrad) ... ok +test_adagrad_option (test_fused_optimizer.TestFusedAdagrad) ... ok +test_float (test_fused_optimizer.TestFusedAdagrad) ... ok +test_half (test_fused_optimizer.TestFusedAdagrad) ... skipped 'PyTorch optimizer is not numerically correct for fp16' +test_multi_device (test_fused_optimizer.TestFusedAdagrad) ... ok +test_multi_params (test_fused_optimizer.TestFusedAdagrad) ... ok +test_multi_params_different_devices_throws (test_fused_optimizer.TestFusedAdagrad) ... ok +test_adam_option (test_fused_optimizer.TestFusedAdam) ... ok +test_bfloat16 (test_fused_optimizer.TestFusedAdam) ... skipped "test doesn't currently work on ROCm stack." +test_float (test_fused_optimizer.TestFusedAdam) ... ok +test_fp16_output (test_fused_optimizer.TestFusedAdam) ... skipped 'No longer support output fp16 param' +test_half (test_fused_optimizer.TestFusedAdam) ... skipped 'NaN issue observed on ROCm as of 12/1/2021. The failing unit test is introduced by a PyTorch commit sometime in between rocm/pytorch:rocm4.3.1_ubuntu18.04_py3.6_pytorch_1.9.0 and 2021/12/01. Please refer to https://github.com/ROCmSoftwarePlatform/apex/issues/63' +test_multi_device (test_fused_optimizer.TestFusedAdam) ... ok +test_multi_params (test_fused_optimizer.TestFusedAdam) ... skipped 'Disable until 8/1/2019 adam/adamw upstream picked' +test_scale (test_fused_optimizer.TestFusedAdam) ... skipped 'No longer support fuse scaling' +test_float (test_fused_optimizer.TestFusedSGD) ... ok +test_half (test_fused_optimizer.TestFusedSGD) ... ok +test_multi_device (test_fused_optimizer.TestFusedSGD) ... ok +test_float (test_fused_optimizer_channels_last.TestFusedSGDChannelLast) ... ok +test_half (test_fused_optimizer_channels_last.TestFusedSGDChannelLast) ... ok +test_multi_device (test_fused_optimizer_channels_last.TestFusedSGDChannelLast) ... ok +test_float (test_lamb.TestFusedLAMB) ... ok +test_half (test_lamb.TestFusedLAMB) ... skipped 'PyTorch optimizer is not numerically correct for fp16' +test_lamb_option (test_lamb.TestFusedLAMB) ... ok +test_multi_device (test_lamb.TestFusedLAMB) ... ok +test_multi_params (test_lamb.TestFusedLAMB) ... ok +test_float (test_lamb.TestFusedMixedPrecisionLamb) ... ok +test_half (test_lamb.TestFusedMixedPrecisionLamb) ... skipped 'PyTorch optimizer is not numerically correct for fp16' +test_lamb_option (test_lamb.TestFusedMixedPrecisionLamb) ... ok +test_multi_device (test_lamb.TestFusedMixedPrecisionLamb) ... skipped 'Skipped the test since it failed the accuracy test on the PyTorch as of 8/1/2022. Please refer to https://github.com/ROCmSoftwarePlatform/apex/issues/83' +test_multi_params (test_lamb.TestFusedMixedPrecisionLamb) ... 
ok + +---------------------------------------------------------------------- +Ran 39 tests in 15.676s + +OK (skipped=9) +test_autocast_fused_layer_norm_bfloat16_elementwise_affine_False_memory_efficient_False_cuda_bfloat16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... /opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/_autocast_utils.py:26: FutureWarning: `torch.cuda.amp.autocast_mode._cast(value, dtype)` is deprecated. Please use `torch.amp.autocast_mode._cast(value, 'cuda', dtype)` instead. + return torch.cuda.amp.autocast_mode._cast(args, torch.get_autocast_gpu_dtype()) +ok +test_autocast_fused_layer_norm_bfloat16_elementwise_affine_False_memory_efficient_True_cuda_bfloat16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_autocast_fused_layer_norm_bfloat16_elementwise_affine_True_memory_efficient_False_cuda_bfloat16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_autocast_fused_layer_norm_bfloat16_elementwise_affine_True_memory_efficient_True_cuda_bfloat16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_autocast_fused_layer_norm_float16_elementwise_affine_False_memory_efficient_False_cuda_float16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_autocast_fused_layer_norm_float16_elementwise_affine_False_memory_efficient_True_cuda_float16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_autocast_fused_layer_norm_float16_elementwise_affine_True_memory_efficient_False_cuda_float16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_autocast_fused_layer_norm_float16_elementwise_affine_True_memory_efficient_True_cuda_float16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_autocast_fused_rms_norm_bfloat16_elementwise_affine_False_memory_efficient_False_cuda_bfloat16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_autocast_fused_rms_norm_bfloat16_elementwise_affine_False_memory_efficient_True_cuda_bfloat16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_autocast_fused_rms_norm_bfloat16_elementwise_affine_True_memory_efficient_False_cuda_bfloat16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_autocast_fused_rms_norm_bfloat16_elementwise_affine_True_memory_efficient_True_cuda_bfloat16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_autocast_fused_rms_norm_float16_elementwise_affine_False_memory_efficient_False_cuda_float16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_autocast_fused_rms_norm_float16_elementwise_affine_False_memory_efficient_True_cuda_float16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_autocast_fused_rms_norm_float16_elementwise_affine_True_memory_efficient_False_cuda_float16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_autocast_fused_rms_norm_float16_elementwise_affine_True_memory_efficient_True_cuda_float16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_compile_fused_layer_norm_elementwise_affine_False_cuda (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_compile_fused_layer_norm_elementwise_affine_True_cuda (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_compile_fused_rms_norm_elementwise_affine_False_cuda (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_compile_fused_rms_norm_elementwise_affine_True_cuda (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_layer_norm_bfloat16_batch_size_16_contiguous_False_elementwise_affine_True_mixed_fused_False_bfloat16_memory_efficient_False_cuda_bfloat16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... 
ok +test_layer_norm_bfloat16_batch_size_16_contiguous_False_elementwise_affine_True_mixed_fused_False_bfloat16_memory_efficient_True_cuda_bfloat16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_layer_norm_bfloat16_batch_size_16_contiguous_True_elementwise_affine_True_mixed_fused_False_bfloat16_memory_efficient_False_cuda_bfloat16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_layer_norm_bfloat16_batch_size_16_contiguous_True_elementwise_affine_True_mixed_fused_False_bfloat16_memory_efficient_True_cuda_bfloat16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_layer_norm_elemwise_batch_size_16_contiguous_False_elementwise_affine_True_mixed_fused_False_float32_memory_efficient_False_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_layer_norm_elemwise_batch_size_16_contiguous_False_elementwise_affine_True_mixed_fused_False_float32_memory_efficient_True_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_layer_norm_elemwise_batch_size_16_contiguous_True_elementwise_affine_True_mixed_fused_False_float32_memory_efficient_False_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_layer_norm_elemwise_batch_size_16_contiguous_True_elementwise_affine_True_mixed_fused_False_float32_memory_efficient_True_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_layer_norm_elemwise_batch_size_65536_contiguous_False_elementwise_affine_True_mixed_fused_False_float32_memory_efficient_False_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_layer_norm_elemwise_batch_size_65536_contiguous_False_elementwise_affine_True_mixed_fused_False_float32_memory_efficient_True_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_layer_norm_elemwise_batch_size_65536_contiguous_True_elementwise_affine_True_mixed_fused_False_float32_memory_efficient_False_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_layer_norm_elemwise_batch_size_65536_contiguous_True_elementwise_affine_True_mixed_fused_False_float32_memory_efficient_True_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_layer_norm_export_cuda (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_layer_norm_half_batch_size_16_contiguous_False_elementwise_affine_True_mixed_fused_False_float16_memory_efficient_False_cuda_float16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_layer_norm_half_batch_size_16_contiguous_False_elementwise_affine_True_mixed_fused_False_float16_memory_efficient_True_cuda_float16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_layer_norm_half_batch_size_16_contiguous_True_elementwise_affine_True_mixed_fused_False_float16_memory_efficient_False_cuda_float16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_layer_norm_half_batch_size_16_contiguous_True_elementwise_affine_True_mixed_fused_False_float16_memory_efficient_True_cuda_float16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_layer_norm_mixed_batch_size_16_contiguous_False_elementwise_affine_True_mixed_fused_True_float32_memory_efficient_False_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_layer_norm_mixed_batch_size_16_contiguous_False_elementwise_affine_True_mixed_fused_True_float32_memory_efficient_True_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... 
ok +test_layer_norm_mixed_batch_size_16_contiguous_True_elementwise_affine_True_mixed_fused_True_float32_memory_efficient_False_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_layer_norm_mixed_batch_size_16_contiguous_True_elementwise_affine_True_mixed_fused_True_float32_memory_efficient_True_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_layer_norm_mixed_batch_size_65536_contiguous_False_elementwise_affine_True_mixed_fused_True_float32_memory_efficient_False_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_layer_norm_mixed_batch_size_65536_contiguous_False_elementwise_affine_True_mixed_fused_True_float32_memory_efficient_True_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_layer_norm_mixed_batch_size_65536_contiguous_True_elementwise_affine_True_mixed_fused_True_float32_memory_efficient_False_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_layer_norm_mixed_batch_size_65536_contiguous_True_elementwise_affine_True_mixed_fused_True_float32_memory_efficient_True_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_layer_norm_regular_batch_size_16_contiguous_False_elementwise_affine_False_mixed_fused_False_float32_memory_efficient_False_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_layer_norm_regular_batch_size_16_contiguous_False_elementwise_affine_False_mixed_fused_False_float32_memory_efficient_True_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_layer_norm_regular_batch_size_16_contiguous_True_elementwise_affine_False_mixed_fused_False_float32_memory_efficient_False_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_layer_norm_regular_batch_size_16_contiguous_True_elementwise_affine_False_mixed_fused_False_float32_memory_efficient_True_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_layer_norm_regular_batch_size_65536_contiguous_False_elementwise_affine_False_mixed_fused_False_float32_memory_efficient_False_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_layer_norm_regular_batch_size_65536_contiguous_False_elementwise_affine_False_mixed_fused_False_float32_memory_efficient_True_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_layer_norm_regular_batch_size_65536_contiguous_True_elementwise_affine_False_mixed_fused_False_float32_memory_efficient_False_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_layer_norm_regular_batch_size_65536_contiguous_True_elementwise_affine_False_mixed_fused_False_float32_memory_efficient_True_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_rms_export_cuda (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_rms_norm_bfloat16_batch_size_16_contiguous_False_elementwise_affine_True_mixed_fused_False_bfloat16_memory_efficient_False_cuda_bfloat16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_rms_norm_bfloat16_batch_size_16_contiguous_False_elementwise_affine_True_mixed_fused_False_bfloat16_memory_efficient_True_cuda_bfloat16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_rms_norm_bfloat16_batch_size_16_contiguous_True_elementwise_affine_True_mixed_fused_False_bfloat16_memory_efficient_False_cuda_bfloat16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... 
ok +test_rms_norm_bfloat16_batch_size_16_contiguous_True_elementwise_affine_True_mixed_fused_False_bfloat16_memory_efficient_True_cuda_bfloat16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_rms_norm_elemwise_batch_size_16_contiguous_False_elementwise_affine_True_mixed_fused_False_float32_memory_efficient_False_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_rms_norm_elemwise_batch_size_16_contiguous_False_elementwise_affine_True_mixed_fused_False_float32_memory_efficient_True_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_rms_norm_elemwise_batch_size_16_contiguous_True_elementwise_affine_True_mixed_fused_False_float32_memory_efficient_False_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_rms_norm_elemwise_batch_size_16_contiguous_True_elementwise_affine_True_mixed_fused_False_float32_memory_efficient_True_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_rms_norm_elemwise_batch_size_65536_contiguous_False_elementwise_affine_True_mixed_fused_False_float32_memory_efficient_False_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_rms_norm_elemwise_batch_size_65536_contiguous_False_elementwise_affine_True_mixed_fused_False_float32_memory_efficient_True_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_rms_norm_elemwise_batch_size_65536_contiguous_True_elementwise_affine_True_mixed_fused_False_float32_memory_efficient_False_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_rms_norm_elemwise_batch_size_65536_contiguous_True_elementwise_affine_True_mixed_fused_False_float32_memory_efficient_True_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_rms_norm_half_batch_size_16_contiguous_False_elementwise_affine_True_mixed_fused_False_float16_memory_efficient_False_cuda_float16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_rms_norm_half_batch_size_16_contiguous_False_elementwise_affine_True_mixed_fused_False_float16_memory_efficient_True_cuda_float16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_rms_norm_half_batch_size_16_contiguous_True_elementwise_affine_True_mixed_fused_False_float16_memory_efficient_False_cuda_float16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_rms_norm_half_batch_size_16_contiguous_True_elementwise_affine_True_mixed_fused_False_float16_memory_efficient_True_cuda_float16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_rms_norm_mixed_batch_size_16_contiguous_False_elementwise_affine_True_mixed_fused_True_float32_memory_efficient_False_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_rms_norm_mixed_batch_size_16_contiguous_False_elementwise_affine_True_mixed_fused_True_float32_memory_efficient_True_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_rms_norm_mixed_batch_size_16_contiguous_True_elementwise_affine_True_mixed_fused_True_float32_memory_efficient_False_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_rms_norm_mixed_batch_size_16_contiguous_True_elementwise_affine_True_mixed_fused_True_float32_memory_efficient_True_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_rms_norm_mixed_batch_size_65536_contiguous_False_elementwise_affine_True_mixed_fused_True_float32_memory_efficient_False_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... 
ok +test_rms_norm_mixed_batch_size_65536_contiguous_False_elementwise_affine_True_mixed_fused_True_float32_memory_efficient_True_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_rms_norm_mixed_batch_size_65536_contiguous_True_elementwise_affine_True_mixed_fused_True_float32_memory_efficient_False_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_rms_norm_mixed_batch_size_65536_contiguous_True_elementwise_affine_True_mixed_fused_True_float32_memory_efficient_True_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_rms_norm_regular_batch_size_16_contiguous_False_elementwise_affine_False_mixed_fused_False_float32_memory_efficient_False_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_rms_norm_regular_batch_size_16_contiguous_False_elementwise_affine_False_mixed_fused_False_float32_memory_efficient_True_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_rms_norm_regular_batch_size_16_contiguous_True_elementwise_affine_False_mixed_fused_False_float32_memory_efficient_False_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_rms_norm_regular_batch_size_16_contiguous_True_elementwise_affine_False_mixed_fused_False_float32_memory_efficient_True_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_rms_norm_regular_batch_size_65536_contiguous_False_elementwise_affine_False_mixed_fused_False_float32_memory_efficient_False_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_rms_norm_regular_batch_size_65536_contiguous_False_elementwise_affine_False_mixed_fused_False_float32_memory_efficient_True_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_rms_norm_regular_batch_size_65536_contiguous_True_elementwise_affine_False_mixed_fused_False_float32_memory_efficient_False_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok +test_rms_norm_regular_batch_size_65536_contiguous_True_elementwise_affine_False_mixed_fused_False_float32_memory_efficient_True_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok + +---------------------------------------------------------------------- +Ran 86 tests in 108.763s + +OK +test_creation (test_mlp.TestMLP) ... ok +test_no_bias (test_mlp.TestMLP) ... skipped 'Test is flaky.' +test_no_grad (test_mlp.TestMLP) ... skipped 'Test is flaky.' +test_numeric (test_mlp.TestMLP) ... skipped 'Test is flaky.' +test_performance_half (test_mlp.TestMLP) ... ok +test_with_bias (test_mlp.TestMLP) ... skipped 'Test is flaky.' + +---------------------------------------------------------------------- +Ran 6 tests in 1.272s + +OK (skipped=4) +test_fused_dense (test_fused_dense.FusedDenseTest) ... ok +test_fused_dense_gelu_dense (test_gelu.FusedDenseGeluDenseTest) ... ok + +---------------------------------------------------------------------- +Ran 2 tests in 0.052s + +OK +test_batch_sampler_behavior (test_batch_sampler.TestBatchSamplerBehavior) ... /skishore/github/apex/tests/L0/run_amp + +Executing tests from /skishore/github/apex/tests/L0/run_amp +Warning: unscaling grads that are not FP32. Unscaling non-fp32 grads may indicate an error. When using Amp, you don't need to call .half() on your model. 
+/skishore/github/apex/tests/L0/run_fp16util + +Executing tests from /skishore/github/apex/tests/L0/run_fp16util +/skishore/github/apex/tests/L0/run_optimizers + +Executing tests from /skishore/github/apex/tests/L0/run_optimizers +/skishore/github/apex/tests/L0/run_fused_layer_norm + +Executing tests from /skishore/github/apex/tests/L0/run_fused_layer_norm +/skishore/github/apex/tests/L0/run_mlp + +Executing tests from /skishore/github/apex/tests/L0/run_mlp + +Pytorch MLP time 1.4724 ms +C++ MLP time 0.7349 ms +/skishore/github/apex/tests/L0/run_fused_dense + +Executing tests from /skishore/github/apex/tests/L0/run_fused_dense +/skishore/github/apex/tests/L0/run_transformer + +Executing tests from /skishore/github/apex/tests/L0/run_transformer +ok +test_split_batch (test_batch_sampler.TestBatchSamplerBehavior) ... ok +test_cross_entropy (test_cross_entropy.NcclVocabParallelCrossEntropyTest) ... [rank3]:[W603 15:57:52.202502613 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank2]:[W603 15:57:52.204052493 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank1]:[W603 15:57:52.206180331 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank0]:[W603 15:57:52.628771833 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +/skishore/github/apex/tests/L0/run_transformer/test_cross_entropy.py:28: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at /skishore/github/pytorch/torch/csrc/tensor/python_tensor.cpp:78.) + target = torch.cuda.LongTensor(size=(batch_size, seq_length)).random_(0, vocab_size) +/skishore/github/apex/tests/L0/run_transformer/test_cross_entropy.py:28: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at /skishore/github/pytorch/torch/csrc/tensor/python_tensor.cpp:78.) + target = torch.cuda.LongTensor(size=(batch_size, seq_length)).random_(0, vocab_size) +/skishore/github/apex/tests/L0/run_transformer/test_cross_entropy.py:28: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at /skishore/github/pytorch/torch/csrc/tensor/python_tensor.cpp:78.) 
+ target = torch.cuda.LongTensor(size=(batch_size, seq_length)).random_(0, vocab_size) +/skishore/github/apex/tests/L0/run_transformer/test_cross_entropy.py:28: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at /skishore/github/pytorch/torch/csrc/tensor/python_tensor.cpp:78.) + target = torch.cuda.LongTensor(size=(batch_size, seq_length)).random_(0, vocab_size) +[dist init] rank = 0, world_size = 4 +[dist init] rank = 3, world_size = 4 +[dist init] rank = 2, world_size = 4 +[dist init] rank = 1, world_size = 4 +ok +test_cross_entropy (test_cross_entropy.UccVocabParallelCrossEntropyTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' +test_broadcast_data (test_data.NcclBroadcastDataTest) ... [rank3]:[W603 15:58:13.144932380 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank2]:[W603 15:58:13.146511574 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank1]:[W603 15:58:13.150470229 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank0]:[W603 15:58:13.156459458 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +25-06-03 15:58:15 - PID:18274 - rank:(0, 0, 0, 0) - parallel_state.py:145 - INFO - > initializing tensor model parallel with size 4 +25-06-03 15:58:15 - PID:18274 - rank:(0, 0, 0, 0) - parallel_state.py:150 - INFO - > initializing pipeline model parallel with size 1 +25-06-03 15:58:15 - PID:18274 - rank:(0, 0, 0, 0) - parallel_state.py:155 - INFO - > initializing data parallel with size 1 +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/data.py:50: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at /skishore/github/pytorch/torch/csrc/tensor/python_tensor.cpp:78.) + sizes_cuda = torch.cuda.LongTensor(sizes) +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/data.py:50: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at /skishore/github/pytorch/torch/csrc/tensor/python_tensor.cpp:78.) + sizes_cuda = torch.cuda.LongTensor(sizes) +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/data.py:50: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. 
It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at /skishore/github/pytorch/torch/csrc/tensor/python_tensor.cpp:78.) + sizes_cuda = torch.cuda.LongTensor(sizes) +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/data.py:50: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at /skishore/github/pytorch/torch/csrc/tensor/python_tensor.cpp:78.) + sizes_cuda = torch.cuda.LongTensor(sizes) +[dist init] rank = 0, world_size = 4 +[dist init] rank = 3, world_size = 4 +[dist init] rank = 2, world_size = 4 +[dist init] rank = 1, world_size = 4 +ok +test_broadcast_data (test_data.UccBroadcastDataTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' +test_fused_bias_swiglu (test_fused_bias_swiglu.TestFusedBiasSwiGLU) ... ok +test_2d_forward_backward (test_fused_rope.TestFusedRoPE) ... ok +test_forward_backward (test_fused_rope.TestFusedRoPE) ... ok +test_thd_forward_backward (test_fused_rope.TestFusedRoPE) ... ok +test_autocast_fused_scale_mask_softmax (test_fused_softmax.TestFusedScaleMaskSoftmax) ... /skishore/github/apex/tests/L0/run_transformer/test_fused_softmax.py:119: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast(dtype=dtype): +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/_autocast_utils.py:26: FutureWarning: `torch.cuda.amp.autocast_mode._cast(value, dtype)` is deprecated. Please use `torch.amp.autocast_mode._cast(value, 'cuda', dtype)` instead. + return torch.cuda.amp.autocast_mode._cast(args, torch.get_autocast_gpu_dtype()) +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/functional/fused_softmax.py:98: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast(enabled=False): +ok +test_autocast_fused_upper_triangle_mask_softmax (test_fused_softmax.TestFusedScaleMaskSoftmax) ... /skishore/github/apex/tests/L0/run_transformer/test_fused_softmax.py:207: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast(dtype=dtype): +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/functional/fused_softmax.py:59: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast(enabled=False): +ok +test_fused_scale_mask_softmax (test_fused_softmax.TestFusedScaleMaskSoftmax) +attention_scores.shape = [4, 12, 24, 24] ... ok +test_fused_upper_triangle_mask_softmax (test_fused_softmax.TestFusedScaleMaskSoftmax) +attn_weights.shape: [4, 12, 24, 24] ... ok +test_affine_weight_init_column_parallel_cpu (test_layers.NcclTensorParallelLayerTest) ... Testing with data type: torch.float32 +Test succeeded for data type: torch.float32 +Testing with data type: torch.float64 +Test succeeded for data type: torch.float64 +Testing with data type: torch.float16 +Test succeeded for data type: torch.float16 +[rank1]:[W603 15:58:37.066884666 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. 
This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank2]:[W603 15:58:37.235722975 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank0]:[W603 15:58:37.239866274 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank3]:[W603 15:58:37.245574686 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[dist init] rank = 0, world_size = 4 +[dist init] rank = 3, world_size = 4 +[dist init] rank = 1, world_size = 4 +[dist init] rank = 2, world_size = 4 +ok +test_affine_weight_init_column_parallel_gpu (test_layers.NcclTensorParallelLayerTest) ... [rank2]:[W603 15:58:57.652765635 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank0]:[W603 15:58:58.377812841 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank3]:[W603 15:58:58.434011858 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank1]:[W603 15:58:58.436293735 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[dist init] rank = 0, world_size = 4 +[dist init] rank = 1, world_size = 4 +[dist init] rank = 3, world_size = 4 +[dist init] rank = 2, world_size = 4 +ok +test_affine_weight_init_row_parallel_cpu (test_layers.NcclTensorParallelLayerTest) ... [rank0]:[W603 15:59:14.275444278 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank2]:[W603 15:59:14.276935120 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. 
+[rank3]:[W603 15:59:14.672555975 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank1]:[W603 15:59:14.674465709 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[dist init] rank = 0, world_size = 4 +[dist init] rank = 3, world_size = 4 +[dist init] rank = 2, world_size = 4 +[dist init] rank = 1, world_size = 4 +ok +test_affine_weight_init_row_parallel_gpu (test_layers.NcclTensorParallelLayerTest) ... [rank1]:[W603 15:59:29.087710480 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank0]:[W603 15:59:30.057818801 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank2]:[W603 15:59:30.066972767 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank3]:[W603 15:59:30.168463263 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[dist init] rank = 0, world_size = 4 +[dist init] rank = 3, world_size = 4 +[dist init] rank = 1, world_size = 4 +[dist init] rank = 2, world_size = 4 +ok +test_all_gather_parity (test_layers.NcclTensorParallelLayerTest) ... [rank0]:[W603 15:59:43.936977030 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank1]:[W603 15:59:44.440270376 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank3]:[W603 15:59:44.583908053 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank2]:[W603 15:59:44.586542463 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. 
You can specify device_id in init_process_group() to force use of a particular device. +/skishore/github/pytorch/torch/distributed/c10d_logger.py:81: FutureWarning: `torch.distributed._all_gather_base` is a private function and will be deprecated. Please use `torch.distributed.all_gather_into_tensor` instead. + return func(*args, **kwargs) +/skishore/github/pytorch/torch/distributed/c10d_logger.py:81: FutureWarning: `torch.distributed._all_gather_base` is a private function and will be deprecated. Please use `torch.distributed.all_gather_into_tensor` instead. + return func(*args, **kwargs) +/skishore/github/pytorch/torch/distributed/c10d_logger.py:81: FutureWarning: `torch.distributed._all_gather_base` is a private function and will be deprecated. Please use `torch.distributed.all_gather_into_tensor` instead. + return func(*args, **kwargs) +/skishore/github/pytorch/torch/distributed/c10d_logger.py:81: FutureWarning: `torch.distributed._all_gather_base` is a private function and will be deprecated. Please use `torch.distributed.all_gather_into_tensor` instead. + return func(*args, **kwargs) +[dist init] rank = 1, world_size = 4 +[dist init] rank = 0, world_size = 4 +[dist init] rank = 3, world_size = 4 +[dist init] rank = 2, world_size = 4 +ok +test_column_parallel_linear (test_layers.NcclTensorParallelLayerTest) ... [rank0]:[W603 16:00:01.921646084 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank2]:[W603 16:00:01.978806709 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank1]:[W603 16:00:01.015221152 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank3]:[W603 16:00:02.338829601 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[dist init] rank = 0, world_size = 4 +[dist init] rank = 3, world_size = 4 +[dist init] rank = 1, world_size = 4 +[dist init] rank = 2, world_size = 4 +ok +test_column_parallel_linear_async (test_layers.NcclTensorParallelLayerTest) ... [rank2]:[W603 16:00:28.003290927 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank1]:[W603 16:00:28.006099365 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. 
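Note: the repeated ProcessGroupNCCL "rank to GPU mapping" warnings above are informational and the affected tests still pass. The remedy the warning itself suggests is to bind each rank to its device when the process group is created; a minimal sketch, assuming a torchrun-style launcher that exports LOCAL_RANK (illustrative only, not part of this test harness):

    import os
    import torch
    import torch.distributed as dist

    # Assumption: the launcher provides LOCAL_RANK/RANK/WORLD_SIZE/MASTER_ADDR/MASTER_PORT.
    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
    torch.cuda.set_device(local_rank)

    # device_id (recent PyTorch) pins the rank -> GPU mapping up front, which is what the
    # "using GPU N as device used by this process is currently unknown" warning asks for.
    dist.init_process_group(
        backend="nccl",
        device_id=torch.device(f"cuda:{local_rank}"),
    )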
+[rank3]:[W603 16:00:29.187504064 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank0]:[W603 16:00:29.407647177 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[dist init] rank = 0, world_size = 4 +[dist init] rank = 3, world_size = 4 +[dist init] rank = 1, world_size = 4 +[dist init] rank = 2, world_size = 4 +ok +test_column_parallel_linear_exception (test_layers.NcclTensorParallelLayerTest) ... [rank2]:[W603 16:00:55.497104619 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank0]:[W603 16:00:55.499103466 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank3]:[W603 16:00:55.499710677 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank1]:[W603 16:00:55.505389085 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[dist init] rank = 0, world_size = 4 +[dist init] rank = 3, world_size = 4 +[dist init] rank = 1, world_size = 4 +[dist init] rank = 2, world_size = 4 +ok +test_column_parallel_linear_gradient_accumulation_fusion (test_layers.NcclTensorParallelLayerTest) ... [rank0]:[W603 16:01:08.438194094 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank3]:[W603 16:01:08.541519924 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank1]:[W603 16:01:08.588451817 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank2]:[W603 16:01:08.668895046 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. 
This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[dist init] rank = 0, world_size = 4 +[dist init] rank = 1, world_size = 4 +[dist init] rank = 3, world_size = 4 +[dist init] rank = 2, world_size = 4 +ok +test_column_parallel_linear_gradient_accumulation_fusion_in_fp16 (test_layers.NcclTensorParallelLayerTest) ... [rank2]:[W603 16:01:36.871666800 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank1]:[W603 16:01:36.875749479 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank0]:[W603 16:01:36.881789253 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank3]:[W603 16:01:37.327369926 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[dist init] rank = 0, world_size = 4 +[dist init] rank = 2, world_size = 4 +[dist init] rank = 1, world_size = 4 +[dist init] rank = 3, world_size = 4 +ok +test_column_parallel_linear_sequence_parallel (test_layers.NcclTensorParallelLayerTest) ... [rank1]:[W603 16:02:03.137927117 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank2]:[W603 16:02:03.145352918 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank0]:[W603 16:02:03.157190847 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank3]:[W603 16:02:03.507710155 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. 
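Note: the c10d_logger.py FutureWarnings in this log come from apex's tensor-parallel code calling the private torch.distributed._all_gather_base; PyTorch's suggested replacement is all_gather_into_tensor, which takes the same (output, input) pair. A hedged sketch of the migration (the helper name is hypothetical, not an actual apex function):

    import torch
    import torch.distributed as dist

    def gather_along_first_dim(inp: torch.Tensor, group=None) -> torch.Tensor:
        # Output holds world_size copies stacked along dim 0.
        world_size = dist.get_world_size(group=group)
        out = torch.empty((world_size * inp.size(0), *inp.size()[1:]),
                          dtype=inp.dtype, device=inp.device)
        # old: dist._all_gather_base(out, inp.contiguous(), group=group)
        dist.all_gather_into_tensor(out, inp.contiguous(), group=group)
        return out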
+/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/layers.py:547: UserWarning: `sequence_parallel_enabled` is set to `True`, but got world_size of 1 + warnings.warn( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/layers.py:547: UserWarning: `sequence_parallel_enabled` is set to `True`, but got world_size of 1 + warnings.warn( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/layers.py:547: UserWarning: `sequence_parallel_enabled` is set to `True`, but got world_size of 1 + warnings.warn( +/skishore/github/pytorch/torch/distributed/c10d_logger.py:81: FutureWarning: `torch.distributed._all_gather_base` is a private function and will be deprecated. Please use `torch.distributed.all_gather_into_tensor` instead. + return func(*args, **kwargs) +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/layers.py:547: UserWarning: `sequence_parallel_enabled` is set to `True`, but got world_size of 1 + warnings.warn( +/skishore/github/pytorch/torch/distributed/c10d_logger.py:81: FutureWarning: `torch.distributed._all_gather_base` is a private function and will be deprecated. Please use `torch.distributed.all_gather_into_tensor` instead. + return func(*args, **kwargs) +/skishore/github/pytorch/torch/distributed/c10d_logger.py:81: FutureWarning: `torch.distributed._all_gather_base` is a private function and will be deprecated. Please use `torch.distributed.all_gather_into_tensor` instead. + return func(*args, **kwargs) +/skishore/github/pytorch/torch/distributed/c10d_logger.py:81: FutureWarning: `torch.distributed._all_gather_base` is a private function and will be deprecated. Please use `torch.distributed.all_gather_into_tensor` instead. + return func(*args, **kwargs) +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/layers.py:358: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead. + handle = torch.distributed._reduce_scatter_base( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/layers.py:358: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead. + handle = torch.distributed._reduce_scatter_base( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/layers.py:358: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead. + handle = torch.distributed._reduce_scatter_base( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/layers.py:358: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead. + handle = torch.distributed._reduce_scatter_base( +[dist init] rank = 3, world_size = 4 +[dist init] rank = 1, world_size = 4 +[dist init] rank = 2, world_size = 4 +[dist init] rank = 0, world_size = 4 +ok +test_parallel_embedding (test_layers.NcclTensorParallelLayerTest) ... [rank2]:[W603 16:02:32.355274319 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. 
You can specify device_id in init_process_group() to force use of a particular device. +[rank0]:[W603 16:02:32.358432266 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank1]:[W603 16:02:32.358927070 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank3]:[W603 16:02:32.815480058 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[dist init] rank = 3, world_size = 4 +[dist init] rank = 0, world_size = 4 +[dist init] rank = 1, world_size = 4 +[dist init] rank = 2, world_size = 4 +ok +test_reduce_scatter_parity (test_layers.NcclTensorParallelLayerTest) ... [rank3]:[W603 16:02:51.824959503 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank2]:[W603 16:02:51.828917257 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank0]:[W603 16:02:51.871644950 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank1]:[W603 16:02:51.975317575 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +/skishore/github/apex/tests/L0/run_transformer/test_layers.py:127: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead. + _reduce_scatter_base( +/skishore/github/apex/tests/L0/run_transformer/test_layers.py:127: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead. + _reduce_scatter_base( +/skishore/github/apex/tests/L0/run_transformer/test_layers.py:127: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead. + _reduce_scatter_base( +/skishore/github/apex/tests/L0/run_transformer/test_layers.py:127: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead. 
+ _reduce_scatter_base( +[dist init] rank = 0, world_size = 4 +[dist init] rank = 3, world_size = 4 +[dist init] rank = 2, world_size = 4 +[dist init] rank = 1, world_size = 4 +ok +test_row_parallel_linear (test_layers.NcclTensorParallelLayerTest) ... [rank1]:[W603 16:03:15.587822867 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank2]:[W603 16:03:15.593311653 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank0]:[W603 16:03:15.595137332 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank3]:[W603 16:03:15.597665144 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[dist init] rank = 0, world_size = 4 +[dist init] rank = 3, world_size = 4 +[dist init] rank = 2, world_size = 4 +[dist init] rank = 1, world_size = 4 +ok +test_row_parallel_linear_gradient_accumulation_fusion (test_layers.NcclTensorParallelLayerTest) ... [rank0]:[W603 16:03:46.418085342 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank2]:[W603 16:03:46.426745535 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank1]:[W603 16:03:46.428857389 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank3]:[W603 16:03:46.679548820 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[dist init] rank = 0, world_size = 4 +[dist init] rank = 1, world_size = 4 +[dist init] rank = 2, world_size = 4 +[dist init] rank = 3, world_size = 4 +ok +test_row_parallel_linear_gradient_accumulation_fusion_in_fp16 (test_layers.NcclTensorParallelLayerTest) ... [rank1]:[W603 16:04:17.567956750 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. 
You can specify device_id in init_process_group() to force use of a particular device. +[rank3]:[W603 16:04:17.569834858 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank2]:[W603 16:04:17.571835267 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank0]:[W603 16:04:17.572720190 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[dist init] rank = 0, world_size = 4 +[dist init] rank = 3, world_size = 4 +[dist init] rank = 1, world_size = 4 +[dist init] rank = 2, world_size = 4 +ok +test_row_parallel_linear_sequence_parallel (test_layers.NcclTensorParallelLayerTest) ... [rank1]:[W603 16:04:43.923494574 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank3]:[W603 16:04:43.927424246 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank2]:[W603 16:04:43.934572735 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank0]:[W603 16:04:43.936981620 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/mappings.py:125: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead. + torch.distributed._reduce_scatter_base( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/mappings.py:125: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead. + torch.distributed._reduce_scatter_base( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/mappings.py:125: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead. 
+ torch.distributed._reduce_scatter_base( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/mappings.py:125: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead. + torch.distributed._reduce_scatter_base( +/skishore/github/pytorch/torch/distributed/c10d_logger.py:81: FutureWarning: `torch.distributed._all_gather_base` is a private function and will be deprecated. Please use `torch.distributed.all_gather_into_tensor` instead. + return func(*args, **kwargs) +/skishore/github/pytorch/torch/distributed/c10d_logger.py:81: FutureWarning: `torch.distributed._all_gather_base` is a private function and will be deprecated. Please use `torch.distributed.all_gather_into_tensor` instead. + return func(*args, **kwargs) +/skishore/github/pytorch/torch/distributed/c10d_logger.py:81: FutureWarning: `torch.distributed._all_gather_base` is a private function and will be deprecated. Please use `torch.distributed.all_gather_into_tensor` instead. + return func(*args, **kwargs) +/skishore/github/pytorch/torch/distributed/c10d_logger.py:81: FutureWarning: `torch.distributed._all_gather_base` is a private function and will be deprecated. Please use `torch.distributed.all_gather_into_tensor` instead. + return func(*args, **kwargs) +[dist init] rank = 3, world_size = 4 +[dist init] rank = 2, world_size = 4 +[dist init] rank = 0, world_size = 4 +[dist init] rank = 1, world_size = 4 +ok +test_affine_weight_init_column_parallel_cpu (test_layers.UccTensorParallelLayerTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' +test_affine_weight_init_column_parallel_gpu (test_layers.UccTensorParallelLayerTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' +test_affine_weight_init_row_parallel_cpu (test_layers.UccTensorParallelLayerTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' +test_affine_weight_init_row_parallel_gpu (test_layers.UccTensorParallelLayerTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' +test_all_gather_parity (test_layers.UccTensorParallelLayerTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' +test_column_parallel_linear (test_layers.UccTensorParallelLayerTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' +test_column_parallel_linear_async (test_layers.UccTensorParallelLayerTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' +test_column_parallel_linear_exception (test_layers.UccTensorParallelLayerTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' +test_column_parallel_linear_gradient_accumulation_fusion (test_layers.UccTensorParallelLayerTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' +test_column_parallel_linear_gradient_accumulation_fusion_in_fp16 (test_layers.UccTensorParallelLayerTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' +test_column_parallel_linear_sequence_parallel (test_layers.UccTensorParallelLayerTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' +test_parallel_embedding (test_layers.UccTensorParallelLayerTest) ... 
skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' +test_reduce_scatter_parity (test_layers.UccTensorParallelLayerTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' +test_row_parallel_linear (test_layers.UccTensorParallelLayerTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' +test_row_parallel_linear_gradient_accumulation_fusion (test_layers.UccTensorParallelLayerTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' +test_row_parallel_linear_gradient_accumulation_fusion_in_fp16 (test_layers.UccTensorParallelLayerTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' +test_row_parallel_linear_sequence_parallel (test_layers.UccTensorParallelLayerTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' +test_gather (test_mapping.NcclMappingTest) ... [rank0]:[W603 16:05:13.944825889 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank2]:[W603 16:05:13.947762668 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank1]:[W603 16:05:13.958828241 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank3]:[W603 16:05:14.500687005 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[dist init] rank = 0, world_size = 4 +[dist init] rank = 3, world_size = 4 +[dist init] rank = 1, world_size = 4 +[dist init] rank = 2, world_size = 4 +ok +test_reduce (test_mapping.NcclMappingTest) ... [rank1]:[W603 16:05:35.365038307 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank2]:[W603 16:05:35.370537609 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank0]:[W603 16:05:35.371594948 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank3]:[W603 16:05:35.697203236 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. 
This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[dist init] rank = 0, world_size = 4 +[dist init] rank = 3, world_size = 4 +[dist init] rank = 1, world_size = 4 +[dist init] rank = 2, world_size = 4 +ok +test_split (test_mapping.NcclMappingTest) ... [rank1]:[W603 16:05:53.691328007 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank0]:[W603 16:05:53.696602195 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank3]:[W603 16:05:54.608251510 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank2]:[W603 16:05:54.632199522 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[dist init] rank = 0, world_size = 4 +[dist init] rank = 3, world_size = 4 +[dist init] rank = 1, world_size = 4 +[dist init] rank = 2, world_size = 4 +ok +test_gather (test_mapping.UccMappingTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' +test_reduce (test_mapping.UccMappingTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' +test_split (test_mapping.UccMappingTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' +test_constant_microbatch_calculator (test_microbatches.NcclMicrobatchCalculatorTest) ... [rank2]:[W603 16:06:08.861642985 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank0]:[W603 16:06:08.862476903 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank1]:[W603 16:06:08.875005176 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank3]:[W603 16:06:08.876428989 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. 
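Note: likewise, the FutureWarnings above from tensor_parallel/layers.py, tensor_parallel/mappings.py, and test_layers.py use the private torch.distributed._reduce_scatter_base; the public replacement is reduce_scatter_tensor, whose output is 1/world_size of the input along dim 0. An illustrative sketch (hypothetical helper, not an edit actually made to those files):

    import torch
    import torch.distributed as dist

    def reduce_scatter_along_first_dim(inp: torch.Tensor, group=None) -> torch.Tensor:
        world_size = dist.get_world_size(group=group)
        assert inp.size(0) % world_size == 0
        out = torch.empty((inp.size(0) // world_size, *inp.size()[1:]),
                          dtype=inp.dtype, device=inp.device)
        # old: dist._reduce_scatter_base(out, inp.contiguous(), group=group)
        dist.reduce_scatter_tensor(out, inp.contiguous(), group=group)
        return out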
+/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/utils.py:81: UserWarning: This function is only for unittest + warnings.warn("This function is only for unittest") +[dist init] rank = 0, world_size = 4 +[dist init] rank = 3, world_size = 4 +[dist init] rank = 2, world_size = 4 +[dist init] rank = 1, world_size = 4 +ok +test_dynamic_microbatch_calculator (test_microbatches.NcclMicrobatchCalculatorTest) ... [rank2]:[W603 16:06:22.945233453 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank0]:[W603 16:06:22.947858800 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank3]:[W603 16:06:22.949114774 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank1]:[W603 16:06:23.084504080 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/utils.py:81: UserWarning: This function is only for unittest + warnings.warn("This function is only for unittest") +[dist init] rank = 3, world_size = 4 +[dist init] rank = 0, world_size = 4 +[dist init] rank = 2, world_size = 4 +[dist init] rank = 1, world_size = 4 +ok +test_constant_microbatch_calculator (test_microbatches.UccMicrobatchCalculatorTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' +test_dynamic_microbatch_calculator (test_microbatches.UccMicrobatchCalculatorTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' +test_no_interleaving_warmup (test_p2p_comm.UccP2PCommTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' +test_send_backward_recv_backward (test_p2p_comm.UccP2PCommTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' +test_send_forward_recv_forward (test_p2p_comm.UccP2PCommTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' +test_initialize_model_parallel (test_parallel_state.NcclParallelStateTest) ... [rank1]:[W603 16:06:37.933031514 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank0]:[W603 16:06:37.937152991 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. 
You can specify device_id in init_process_group() to force use of a particular device. +[rank3]:[W603 16:06:37.938813605 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank2]:[W603 16:06:38.597196668 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[dist init] rank = 3, world_size = 4 +[dist init] rank = 0, world_size = 4 +[dist init] rank = 1, world_size = 4 +[dist init] rank = 2, world_size = 4 +ok +test_initialize_model_parallel_decoder_only (test_parallel_state.NcclParallelStateTest) +Initialize model parallelism for decoder-only Transformers like GPT-3 ... [rank0]:[W603 16:06:52.865035520 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank3]:[W603 16:06:52.884775276 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank1]:[W603 16:06:52.886067374 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank2]:[W603 16:06:53.276617324 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[dist init] rank = 0, world_size = 4 +[dist init] rank = 3, world_size = 4 +[dist init] rank = 1, world_size = 4 +[dist init] rank = 2, world_size = 4 +ok +test_initialize_model_parallel_with_virtual_and_split (test_parallel_state.NcclParallelStateTest) ... [rank2]:[W603 16:07:06.873261341 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank0]:[W603 16:07:06.883642889 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank1]:[W603 16:07:06.885682606 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. 
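Note: the pipeline-parallel tests further below also emit FutureWarnings for torch.cuda.amp.autocast and torch.cuda.amp.GradScaler; the replacements PyTorch suggests are the device-generic torch.amp entry points. A minimal sketch of the renamed calls (illustrative only; init_scale=4.0 is the value used in test_pipeline_parallel_fwd_bwd.py):

    import torch

    # old: with torch.cuda.amp.autocast(): ...
    with torch.amp.autocast("cuda"):
        pass  # mixed-precision forward pass would go here

    # old: torch.cuda.amp.GradScaler(init_scale=4.0)
    scaler = torch.amp.GradScaler("cuda", init_scale=4.0)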
+[rank3]:[W603 16:07:07.471477788 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[dist init] rank = 0, world_size = 4 +[dist init] rank = 3, world_size = 4 +[dist init] rank = 2, world_size = 4 +[dist init] rank = 1, world_size = 4 +ok +test_initialize_model_parallel (test_parallel_state.UccParallelStateTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' +test_initialize_model_parallel_decoder_only (test_parallel_state.UccParallelStateTest) +Initialize model parallelism for decoder-only Transformers like GPT-3 ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' +test_initialize_model_parallel_with_virtual_and_split (test_parallel_state.UccParallelStateTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' +test_inference_async_pipelining_with_interleaving (test_pipeline_parallel_fwd_bwd.NcclPipelineParallelForwardBackwardTest) ... [rank5]:[W603 16:07:22.584738687 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 5] using GPU 5 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank1]:[W603 16:07:22.591937221 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank3]:[W603 16:07:22.593373272 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank2]:[W603 16:07:22.027355011 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank0]:[W603 16:07:23.106428187 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank7]:[W603 16:07:23.145665811 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 7] using GPU 7 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank4]:[W603 16:07:23.146369115 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 4] using GPU 4 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank6]:[W603 16:07:23.153563473 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 6] using GPU 6 as device used by this process is currently unknown. 
This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[dist init] rank = 4, world_size = 8 + > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 2112 +[dist init] rank = 2, world_size = 8 + > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 2112 +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/utils.py:81: UserWarning: This function is only for unittest + warnings.warn("This function is only for unittest") +[dist init] rank = 6, world_size = 8 + > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 2112 +[dist init] rank = 0, world_size = 8 + > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 2112 +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. 
+ with torch.cuda.amp.autocast( + > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 2112 + > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 2112 + > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 2112 + > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 2112 + > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 2112 + > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 2112 + > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 2112 + > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 2112 +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:208: UserWarning: Converting a tensor with requires_grad=True to a scalar may lead to unexpected behavior. +Consider using tensor.detach() first. (Triggered internally at /skishore/github/pytorch/aten/src/ATen/native/Scalar.cpp:22.) + self.assertEqual(x.item() / microbatch_size, target_loss.item()) + > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 2112 + > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 2112 + > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 2112 +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:208: UserWarning: Converting a tensor with requires_grad=True to a scalar may lead to unexpected behavior. +Consider using tensor.detach() first. (Triggered internally at /skishore/github/pytorch/aten/src/ATen/native/Scalar.cpp:22.) + self.assertEqual(x.item() / microbatch_size, target_loss.item()) + > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 2112 +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. + torch.cuda.amp.GradScaler(init_scale=4.0) + > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 2112 +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. + torch.cuda.amp.GradScaler(init_scale=4.0) +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. + torch.cuda.amp.GradScaler(init_scale=4.0) + > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 2112 +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. + torch.cuda.amp.GradScaler(init_scale=4.0) + > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 2112 +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. + torch.cuda.amp.GradScaler(init_scale=4.0) +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. 
+ torch.cuda.amp.GradScaler(init_scale=4.0)
+ > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 2112
+ > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 2112
+ > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 2112
+ > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 2112
+[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Caught exception:
+[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Traceback (most recent call last):
+[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test
+[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] getattr(self, test_name)()
+[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper
+[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] fn()
+[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper
+[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] method(*args, **kwargs)
+[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 278, in test_inference_async_pipelining_with_interleaving
+[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] self._forward_backward_test_impl(
+[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl
+[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] loss = fwd_bwd_func(
+[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_with_interleaving.py", line 219, in _forward_backward_pipelining_with_interleaving
+[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_communication.recv_forward(
+[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 342, in recv_forward
+[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor, _ = _communicate(
+[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate
+[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm)
+[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops
+[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] reqs = torch.distributed.batch_isend_irecv(ops)
+[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv
+[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_op.op(
+[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2411, in irecv
+[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] return group.recv([tensor], group_src, tag)
+[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1
+[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error.
+[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Last error:
+[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Error while creating shared memory segment /dev/shm/nccl-qmtFLO (size 8257920), error: No space left on device (28)
+[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741]
+[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] To execute this test, run the following from the base repo dir:
+[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_inference_async_pipelining_with_interleaving
+[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741]
+[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0
+[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 6 with exit code: 10
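The failure above boils down to NCCL being unable to create its shared-memory transport segment because /dev/shm is full. A quick local check along these lines can confirm it before rerunning the suite (a minimal sketch, assuming a Linux host with /dev/shm mounted; the 8257920-byte figure and the eight local ranks are taken from the log, not from any NCCL constant):

    # check_shm.py - hypothetical helper, not part of apex or this patch series
    import shutil

    SEGMENT_BYTES = 8257920   # size NCCL tried to allocate in the log above
    NUM_LOCAL_RANKS = 8       # one segment per local rank is a rough lower bound

    usage = shutil.disk_usage("/dev/shm")
    needed = SEGMENT_BYTES * NUM_LOCAL_RANKS
    print(f"/dev/shm free: {usage.free / 2**20:.1f} MiB, rough need: {needed / 2**20:.1f} MiB")
    if usage.free < needed:
        print("Expect 'No space left on device' from NCCL's /dev/shm/nccl-* segments; "
              "free /dev/shm or enlarge it (e.g. docker run --shm-size=1g).")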
+[rank0]:E0603 16:08:24.923000 42069 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 0 with exit code: 10
+[rank4]:E0603 16:08:24.924000 42073 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 4 with exit code: 10
+[rank2]:E0603 16:08:24.924000 42071 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 2 with exit code: 10
+[rank3]:E0603 16:08:24.944000 42072 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 3 with exit code: 10
+[rank5]:E0603 16:08:24.944000 42074 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 5 with exit code: 10
+[rank7]:E0603 16:08:24.944000 42076 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 7 with exit code: 10
+[rank1]:E0603 16:08:24.944000 42070 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 1 with exit code: 10
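Every rank dies at the same call: apex's _run_p2pops builds a list of point-to-point ops and hands it to torch.distributed.batch_isend_irecv, and the NCCL transport setup underneath is what fails. For reference, the call pattern looks roughly like this (a standalone sketch with made-up neighbor ranks and buffers, not the actual p2p_communication.py code):

    # Minimal batched point-to-point exchange mirroring the traceback's call path.
    # Assumes torch.distributed.init_process_group("nccl", ...) has already run.
    import torch
    import torch.distributed as dist

    def exchange_with_neighbors(send_next: torch.Tensor, recv_prev: torch.Tensor,
                                next_rank: int, prev_rank: int) -> torch.Tensor:
        ops = [
            dist.P2POp(dist.isend, send_next, next_rank),
            dist.P2POp(dist.irecv, recv_prev, prev_rank),
        ]
        reqs = dist.batch_isend_irecv(ops)  # the call that raises in the log above
        for req in reqs:
            req.wait()
        return recv_prev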
+[dist init] rank = 3, world_size = 8
+[dist init] rank = 5, world_size = 8
+[dist init] rank = 1, world_size = 8
+[dist init] rank = 7, world_size = 8
+[rank0]:[W603 16:08:25.303926414 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
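The destroy_process_group() warnings are a separate, benign issue: the worker processes exit without tearing down the default process group. The usual cleanup is a couple of lines (an illustrative sketch; the apex test harness may already handle this elsewhere):

    import torch.distributed as dist

    def teardown_distributed() -> None:
        # Drain outstanding work, then release NCCL resources explicitly so
        # ProcessGroupNCCL does not warn at interpreter exit.
        if dist.is_initialized():
            dist.barrier()
            dist.destroy_process_group()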
+ERROR
+test_inference_async_pipelining_without_interleaving (test_pipeline_parallel_fwd_bwd.NcclPipelineParallelForwardBackwardTest) ... Process 3 terminated with exit code 10, terminating remaining processes.
+[rank5]:[W603 16:08:47.287850779 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 5] using GPU 5 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device.
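The "rank to GPU mapping is currently unknown" warning also names its own fix: bind each rank to a device when the process group is created. A sketch assuming one GPU per rank and the usual environment-variable rendezvous (the device_id argument is accepted by recent PyTorch releases, as the warning says):

    import os
    import torch
    import torch.distributed as dist

    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
    device = torch.device("cuda", local_rank)
    torch.cuda.set_device(device)

    # Passing device_id pins the rank-to-GPU mapping up front and avoids the
    # potential hang the warning describes.
    dist.init_process_group(backend="nccl", device_id=device)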
+[dist init] rank = 6, world_size = 8
+[dist init] rank = 4, world_size = 8
+[dist init] rank = 2, world_size = 8
+[dist init] rank = 0, world_size = 8
+ > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 1056
+ > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 1056
+ > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 1056
+ > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 1056
+/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/utils.py:81: UserWarning: This function is only for unittest
+ warnings.warn("This function is only for unittest")
+/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
+ with torch.cuda.amp.autocast(
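The autocast deprecation warning from schedules/common.py has a mechanical migration; the argument list below is illustrative, not apex's actual call:

    import torch

    # Deprecated spelling:
    #     with torch.cuda.amp.autocast(dtype=torch.bfloat16):
    #         y = model(x)
    # Replacement recommended by the warning:
    def forward_with_autocast(model, x):
        with torch.amp.autocast("cuda", dtype=torch.bfloat16):
            return model(x)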
+/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:208: UserWarning: Converting a tensor with requires_grad=True to a scalar may lead to unexpected behavior.
+Consider using tensor.detach() first. (Triggered internally at /skishore/github/pytorch/aten/src/ATen/native/Scalar.cpp:22.)
+ self.assertEqual(x.item() / microbatch_size, target_loss.item())
+/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead.
+ torch.cuda.amp.GradScaler(init_scale=4.0)
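Likewise for the GradScaler warning raised at test_pipeline_parallel_fwd_bwd.py:126; keeping the init_scale the test uses, the migration is one line:

    import torch

    # Deprecated:
    #     scaler = torch.cuda.amp.GradScaler(init_scale=4.0)
    # Replacement recommended by the warning:
    scaler = torch.amp.GradScaler("cuda", init_scale=4.0)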
+[rank2]:E0603 16:09:36.535000 46104 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Caught exception:
+  Traceback (most recent call last):
+    File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test
+      getattr(self, test_name)()
+    File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper
+      fn()
+    File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper
+      method(*args, **kwargs)
+    File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 254, in test_inference_async_pipelining_without_interleaving
+      self._forward_backward_test_impl(
+    File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl
+      loss = fwd_bwd_func(
+    File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 332, in forward_backward_pipelining_without_interleaving
+      input_tensor = recv_forward(
+    File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 99, in recv_forward
+      p2p_communication.recv_forward(
+    File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 342, in recv_forward
+      input_tensor, _ = _communicate(
+    File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate
+      tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm)
+    File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops
+      reqs = torch.distributed.batch_isend_irecv(ops)
+    File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv
+      p2p_op.op(
+    File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2411, in irecv
+      return group.recv([tensor], group_src, tag)
+  torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1
+  ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error.
+  Last error:
+  Error while creating shared memory segment /dev/shm/nccl-fcsADG (size 8257920), error: No space left on device (28)
+  To execute this test, run the following from the base repo dir:
+  PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_inference_async_pipelining_without_interleaving
+  This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0
+  exiting process 2 with exit code: 10
+[... ranks 0, 1 and 3-7 abort with the same torch.distributed.DistBackendError / ncclSystemError: "Error while creating shared memory segment /dev/shm/nccl-* (size 8257920), error: No space left on device (28)". Ranks 0 and 1 hit it in send_forward -> torch.distributed.isend (fwd_bwd_pipelining_without_interleaving.py:349), ranks 6 and 7 in recv_forward at line 367, and ranks 3, 4 and 5 in recv_forward at line 332, mirroring the rank 2 traceback above; each rank prints the same PYTORCH_TEST_WITH_ROCM=1 repro command and exits with exit code 10 ...]
+[dist init] rank = 5, world_size = 8
+[dist init] rank = 7, world_size = 8
+[dist init] rank = 3, world_size = 8
+[dist init] rank = 1, world_size = 8
+[rank2]:[W603 16:09:36.882339790 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
+[... the same destroy_process_group() warning is emitted by all eight ranks ...]
+ERROR
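[Note: every failure above is the same environmental problem: NCCL cannot create its ~8 MB shared-memory segment under /dev/shm (errno 28, "No space left on device"), so batch_isend_irecv fails before any pipeline communication happens. As a hedged, purely illustrative pre-flight check (not part of this patch; the segment size and world size below are simply read off the traceback), something like the following could confirm whether /dev/shm on the test host has room for one segment per rank before re-running:

    import shutil

    SEGMENT_BYTES = 8_257_920   # NCCL segment size reported in the "Last error" line above
    WORLD_SIZE = 8              # ranks used by NcclPipelineParallelForwardBackwardTest

    free = shutil.disk_usage("/dev/shm").free          # /dev/shm is a tmpfs mount
    needed = SEGMENT_BYTES * WORLD_SIZE
    print(f"/dev/shm free: {free / 2**20:.1f} MiB, approx needed: {needed / 2**20:.1f} MiB")
    if free < needed:
        # e.g. enlarge the container's shared memory (docker run --shm-size=...) and retry
        print("Insufficient /dev/shm space for the NCCL shared-memory segments.")
]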
For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +ERROR +test_inference_no_pipelining (test_pipeline_parallel_fwd_bwd.NcclPipelineParallelForwardBackwardTest) ... Process 1 terminated with exit code 10, terminating remaining processes. +[rank7]:[W603 16:09:58.973415270 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 7] using GPU 7 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank5]:[W603 16:09:58.982096174 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 5] using GPU 5 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank0]:[W603 16:09:58.986582472 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank1]:[W603 16:09:59.597541251 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank3]:[W603 16:09:59.614054759 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank4]:[W603 16:09:59.618555048 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 4] using GPU 4 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank6]:[W603 16:09:59.619095230 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 6] using GPU 6 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank2]:[W603 16:09:59.619297020 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/utils.py:81: UserWarning: This function is only for unittest + warnings.warn("This function is only for unittest") +[dist init] rank = 0, world_size = 8 + > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 1056 +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. 
+ with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( + > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 1056 + > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 1056 +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:208: UserWarning: Converting a tensor with requires_grad=True to a scalar may lead to unexpected behavior. +Consider using tensor.detach() first. (Triggered internally at /skishore/github/pytorch/aten/src/ATen/native/Scalar.cpp:22.) + self.assertEqual(x.item() / microbatch_size, target_loss.item()) +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:208: UserWarning: Converting a tensor with requires_grad=True to a scalar may lead to unexpected behavior. +Consider using tensor.detach() first. (Triggered internally at /skishore/github/pytorch/aten/src/ATen/native/Scalar.cpp:22.) + self.assertEqual(x.item() / microbatch_size, target_loss.item()) +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:208: UserWarning: Converting a tensor with requires_grad=True to a scalar may lead to unexpected behavior. +Consider using tensor.detach() first. (Triggered internally at /skishore/github/pytorch/aten/src/ATen/native/Scalar.cpp:22.) + self.assertEqual(x.item() / microbatch_size, target_loss.item()) +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:208: UserWarning: Converting a tensor with requires_grad=True to a scalar may lead to unexpected behavior. +Consider using tensor.detach() first. (Triggered internally at /skishore/github/pytorch/aten/src/ATen/native/Scalar.cpp:22.) 
+ self.assertEqual(x.item() / microbatch_size, target_loss.item()) +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:208: UserWarning: Converting a tensor with requires_grad=True to a scalar may lead to unexpected behavior. +Consider using tensor.detach() first. (Triggered internally at /skishore/github/pytorch/aten/src/ATen/native/Scalar.cpp:22.) + self.assertEqual(x.item() / microbatch_size, target_loss.item()) +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:208: UserWarning: Converting a tensor with requires_grad=True to a scalar may lead to unexpected behavior. +Consider using tensor.detach() first. (Triggered internally at /skishore/github/pytorch/aten/src/ATen/native/Scalar.cpp:22.) + self.assertEqual(x.item() / microbatch_size, target_loss.item()) + > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 1056 +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:208: UserWarning: Converting a tensor with requires_grad=True to a scalar may lead to unexpected behavior. +Consider using tensor.detach() first. (Triggered internally at /skishore/github/pytorch/aten/src/ATen/native/Scalar.cpp:22.) + self.assertEqual(x.item() / microbatch_size, target_loss.item()) +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:208: UserWarning: Converting a tensor with requires_grad=True to a scalar may lead to unexpected behavior. +Consider using tensor.detach() first. (Triggered internally at /skishore/github/pytorch/aten/src/ATen/native/Scalar.cpp:22.) + self.assertEqual(x.item() / microbatch_size, target_loss.item()) +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. + torch.cuda.amp.GradScaler(init_scale=4.0) +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. + torch.cuda.amp.GradScaler(init_scale=4.0) +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. + torch.cuda.amp.GradScaler(init_scale=4.0) +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. + torch.cuda.amp.GradScaler(init_scale=4.0) + > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 1056 +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. + torch.cuda.amp.GradScaler(init_scale=4.0) +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. + torch.cuda.amp.GradScaler(init_scale=4.0) +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. 
+ torch.cuda.amp.GradScaler(init_scale=4.0) +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. + torch.cuda.amp.GradScaler(init_scale=4.0) + > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 1056 + > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 1056 + > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 1056 +[dist init] rank = 5, world_size = 8 +[dist init] rank = 7, world_size = 8 +[dist init] rank = 6, world_size = 8 +[dist init] rank = 4, world_size = 8 +[dist init] rank = 3, world_size = 8 +[dist init] rank = 1, world_size = 8 +[dist init] rank = 2, world_size = 8 +ok +test_inference_pipelining_with_interleaving (test_pipeline_parallel_fwd_bwd.NcclPipelineParallelForwardBackwardTest) ... [rank3]:[W603 16:11:13.919670522 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank0]:[W603 16:11:13.938326980 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank5]:[W603 16:11:13.939926885 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 5] using GPU 5 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank4]:[W603 16:11:13.941251481 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 4] using GPU 4 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank1]:[W603 16:11:13.942232367 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank7]:[W603 16:11:13.945283054 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 7] using GPU 7 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank6]:[W603 16:11:13.956809010 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 6] using GPU 6 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank2]:[W603 16:11:13.960771772 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. 
+/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/utils.py:81: UserWarning: This function is only for unittest + warnings.warn("This function is only for unittest") +[dist init] rank = 0, world_size = 8 + > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 2112 +[dist init] rank = 4, world_size = 8 + > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 2112 +[dist init] rank = 2, world_size = 8 + > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 2112 +[dist init] rank = 6, world_size = 8 + > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 2112 +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. 
+ with torch.cuda.amp.autocast( + > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 2112 + > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 2112 + > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 2112 + > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 2112 + > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 2112 + > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 2112 + > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 2112 + > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 2112 + > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 2112 +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:208: UserWarning: Converting a tensor with requires_grad=True to a scalar may lead to unexpected behavior. +Consider using tensor.detach() first. (Triggered internally at /skishore/github/pytorch/aten/src/ATen/native/Scalar.cpp:22.) + self.assertEqual(x.item() / microbatch_size, target_loss.item()) +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:208: UserWarning: Converting a tensor with requires_grad=True to a scalar may lead to unexpected behavior. +Consider using tensor.detach() first. (Triggered internally at /skishore/github/pytorch/aten/src/ATen/native/Scalar.cpp:22.) + self.assertEqual(x.item() / microbatch_size, target_loss.item()) + > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 2112 + > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 2112 + > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 2112 +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. + torch.cuda.amp.GradScaler(init_scale=4.0) +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. + torch.cuda.amp.GradScaler(init_scale=4.0) +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. + torch.cuda.amp.GradScaler(init_scale=4.0) +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. + torch.cuda.amp.GradScaler(init_scale=4.0) + > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 2112 +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. + torch.cuda.amp.GradScaler(init_scale=4.0) + > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 2112 +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. 
+ torch.cuda.amp.GradScaler(init_scale=4.0) +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. + torch.cuda.amp.GradScaler(init_scale=4.0) + > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 2112 +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. + torch.cuda.amp.GradScaler(init_scale=4.0) + > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 2112 + > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 2112 + > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 2112 + > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 2112 + > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 2112 + > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 2112 + > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 2112 + > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 2112 + > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 2112 + > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 2112 + > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 2112 + > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 2112 + > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 2112 +[rank5]:E0603 16:12:26.862000 53769 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Caught exception: +[rank5]:E0603 16:12:26.862000 53769 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Traceback (most recent call last): +[rank5]:E0603 16:12:26.862000 53769 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test +[rank5]:E0603 16:12:26.862000 53769 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] getattr(self, test_name)() +[rank5]:E0603 16:12:26.862000 53769 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper +[rank5]:E0603 16:12:26.862000 53769 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] fn() +[rank5]:E0603 16:12:26.862000 53769 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper +[rank5]:E0603 16:12:26.862000 53769 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] method(*args, **kwargs) +[rank5]:E0603 16:12:26.862000 53769 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 266, in test_inference_pipelining_with_interleaving +[rank5]:E0603 16:12:26.862000 53769 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] self._forward_backward_test_impl( +[rank5]:E0603 16:12:26.862000 53769 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File 
"/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl +[rank5]:E0603 16:12:26.862000 53769 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] loss = fwd_bwd_func( +[rank5]:E0603 16:12:26.862000 53769 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_with_interleaving.py", line 219, in _forward_backward_pipelining_with_interleaving +[rank5]:E0603 16:12:26.862000 53769 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_communication.recv_forward( +[rank5]:E0603 16:12:26.862000 53769 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 342, in recv_forward +[rank5]:E0603 16:12:26.862000 53769 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor, _ = _communicate( +[rank5]:E0603 16:12:26.862000 53769 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate +[rank5]:E0603 16:12:26.862000 53769 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm) +[rank5]:E0603 16:12:26.862000 53769 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops +[rank5]:E0603 16:12:26.862000 53769 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] reqs = torch.distributed.batch_isend_irecv(ops) +[rank5]:E0603 16:12:26.862000 53769 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv +[rank5]:E0603 16:12:26.862000 53769 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_op.op( +[rank5]:E0603 16:12:26.862000 53769 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2411, in irecv +[rank5]:E0603 16:12:26.862000 53769 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] return group.recv([tensor], group_src, tag) +[rank5]:E0603 16:12:26.862000 53769 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1 +[rank5]:E0603 16:12:26.862000 53769 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error. 
+[rank5]:E0603 16:12:26.862000 53769 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Last error: +[rank5]:E0603 16:12:26.862000 53769 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Error while creating shared memory segment /dev/shm/nccl-F0ZAWL (size 8257920), error: No space left on device (28) +[rank5]:E0603 16:12:26.862000 53769 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] +[rank5]:E0603 16:12:26.862000 53769 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] To execute this test, run the following from the base repo dir: +[rank5]:E0603 16:12:26.862000 53769 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_inference_pipelining_with_interleaving +[rank5]:E0603 16:12:26.862000 53769 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] +[rank5]:E0603 16:12:26.862000 53769 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 +[rank5]:E0603 16:12:26.862000 53769 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 5 with exit code: 10 +[dist init] rank = 5, world_size = 8 +[rank1]:E0603 16:12:26.863000 53765 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Caught exception: +[rank1]:E0603 16:12:26.863000 53765 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Traceback (most recent call last): +[rank1]:E0603 16:12:26.863000 53765 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test +[rank1]:E0603 16:12:26.863000 53765 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] getattr(self, test_name)() +[rank1]:E0603 16:12:26.863000 53765 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper +[rank1]:E0603 16:12:26.863000 53765 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] fn() +[rank1]:E0603 16:12:26.863000 53765 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper +[rank1]:E0603 16:12:26.863000 53765 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] method(*args, **kwargs) +[rank1]:E0603 16:12:26.863000 53765 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 266, in test_inference_pipelining_with_interleaving +[rank1]:E0603 16:12:26.863000 53765 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] self._forward_backward_test_impl( +[rank1]:E0603 16:12:26.863000 53765 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl +[rank1]:E0603 16:12:26.863000 53765 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] loss = fwd_bwd_func( +[rank1]:E0603 16:12:26.863000 53765 
/skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_with_interleaving.py", line 276, in _forward_backward_pipelining_with_interleaving +[rank1]:E0603 16:12:26.863000 53765 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor = p2p_communication.send_forward_recv_forward( +[rank1]:E0603 16:12:26.863000 53765 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 515, in send_forward_recv_forward +[rank1]:E0603 16:12:26.863000 53765 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor, _ = _communicate( +[rank1]:E0603 16:12:26.863000 53765 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate +[rank1]:E0603 16:12:26.863000 53765 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm) +[rank1]:E0603 16:12:26.863000 53765 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops +[rank1]:E0603 16:12:26.863000 53765 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] reqs = torch.distributed.batch_isend_irecv(ops) +[rank1]:E0603 16:12:26.863000 53765 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv +[rank1]:E0603 16:12:26.863000 53765 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_op.op( +[rank1]:E0603 16:12:26.863000 53765 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2366, in isend +[rank1]:E0603 16:12:26.863000 53765 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] return group.send([tensor], group_dst, tag) +[rank1]:E0603 16:12:26.863000 53765 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1 +[rank1]:E0603 16:12:26.863000 53765 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error. 
+[rank1]:E0603 16:12:26.863000 53765 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Last error:
+[rank1]:E0603 16:12:26.863000 53765 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Error while creating shared memory segment /dev/shm/nccl-QgAUbi (size 8257920), error: No space left on device (28)
+[rank1]:E0603 16:12:26.863000 53765 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741]
+[rank1]:E0603 16:12:26.863000 53765 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] To execute this test, run the following from the base repo dir:
+[rank1]:E0603 16:12:26.863000 53765 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_inference_pipelining_with_interleaving
+[rank1]:E0603 16:12:26.863000 53765 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741]
+[rank1]:E0603 16:12:26.863000 53765 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0
+[rank1]:E0603 16:12:26.863000 53765 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 1 with exit code: 10
+[ranks 7, 3, 6, 4 raised the same torch.distributed.DistBackendError (ncclSystemError: Error while creating shared memory segment in /dev/shm, No space left on device (28)) from p2p_communication.recv_forward -> _communicate -> _run_p2pops -> torch.distributed.batch_isend_irecv, and each exited with exit code: 10]
+[dist init] rank = 1, world_size = 8
+[dist init] rank = 7, world_size = 8
+[dist init] rank = 3, world_size = 8
+[rank0]:E0603 16:12:26.877000 53764 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Caught exception:
+[rank0]:E0603 16:12:26.877000 53764 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Traceback (most recent call last):
+[rank0]:E0603 16:12:26.877000 53764 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test
+[rank0]:E0603 16:12:26.877000 53764 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] getattr(self, test_name)()
+[rank0]:E0603 16:12:26.877000 53764 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper
+[rank0]:E0603 16:12:26.877000 53764 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] fn()
+[rank0]:E0603 16:12:26.877000 53764 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper
+[rank0]:E0603 16:12:26.877000 53764 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] method(*args, **kwargs)
+[rank0]:E0603 16:12:26.877000 53764 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 266, in test_inference_pipelining_with_interleaving
+[rank0]:E0603 16:12:26.877000 53764 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] self._forward_backward_test_impl(
+[rank0]:E0603 16:12:26.877000 53764 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl
+[rank0]:E0603 16:12:26.877000 53764 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] loss = fwd_bwd_func(
+[rank0]:E0603 16:12:26.877000 53764
/skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_with_interleaving.py", line 276, in _forward_backward_pipelining_with_interleaving +[rank0]:E0603 16:12:26.877000 53764 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor = p2p_communication.send_forward_recv_forward( +[rank0]:E0603 16:12:26.877000 53764 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 515, in send_forward_recv_forward +[rank0]:E0603 16:12:26.877000 53764 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor, _ = _communicate( +[rank0]:E0603 16:12:26.877000 53764 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate +[rank0]:E0603 16:12:26.877000 53764 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm) +[rank0]:E0603 16:12:26.877000 53764 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops +[rank0]:E0603 16:12:26.877000 53764 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] reqs = torch.distributed.batch_isend_irecv(ops) +[rank0]:E0603 16:12:26.877000 53764 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv +[rank0]:E0603 16:12:26.877000 53764 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_op.op( +[rank0]:E0603 16:12:26.877000 53764 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2366, in isend +[rank0]:E0603 16:12:26.877000 53764 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] return group.send([tensor], group_dst, tag) +[rank0]:E0603 16:12:26.877000 53764 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1 +[rank0]:E0603 16:12:26.877000 53764 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error. 
+[rank0]:E0603 16:12:26.877000 53764 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Last error: +[rank0]:E0603 16:12:26.877000 53764 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Error while creating shared memory segment /dev/shm/nccl-VMshzE (size 8257920), error: No space left on device (28) +[rank0]:E0603 16:12:26.877000 53764 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] +[rank0]:E0603 16:12:26.877000 53764 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] To execute this test, run the following from the base repo dir: +[rank0]:E0603 16:12:26.877000 53764 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_inference_pipelining_with_interleaving +[rank0]:E0603 16:12:26.877000 53764 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] +[rank0]:E0603 16:12:26.877000 53764 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 +[rank0]:E0603 16:12:26.877000 53764 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 0 with exit code: 10 +[rank2]:E0603 16:12:26.877000 53766 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Caught exception: +[rank2]:E0603 16:12:26.877000 53766 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Traceback (most recent call last): +[rank2]:E0603 16:12:26.877000 53766 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test +[rank2]:E0603 16:12:26.877000 53766 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] getattr(self, test_name)() +[rank2]:E0603 16:12:26.877000 53766 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper +[rank2]:E0603 16:12:26.877000 53766 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] fn() +[rank2]:E0603 16:12:26.877000 53766 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper +[rank2]:E0603 16:12:26.877000 53766 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] method(*args, **kwargs) +[rank2]:E0603 16:12:26.877000 53766 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 266, in test_inference_pipelining_with_interleaving +[rank2]:E0603 16:12:26.877000 53766 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] self._forward_backward_test_impl( +[rank2]:E0603 16:12:26.877000 53766 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl +[rank2]:E0603 16:12:26.877000 53766 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] loss = fwd_bwd_func( +[rank2]:E0603 16:12:26.877000 53766 
/skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_with_interleaving.py", line 219, in _forward_backward_pipelining_with_interleaving +[rank2]:E0603 16:12:26.877000 53766 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_communication.recv_forward( +[rank2]:E0603 16:12:26.877000 53766 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 342, in recv_forward +[rank2]:E0603 16:12:26.877000 53766 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor, _ = _communicate( +[rank2]:E0603 16:12:26.877000 53766 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate +[rank2]:E0603 16:12:26.877000 53766 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm) +[rank2]:E0603 16:12:26.877000 53766 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops +[rank2]:E0603 16:12:26.877000 53766 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] reqs = torch.distributed.batch_isend_irecv(ops) +[rank2]:E0603 16:12:26.877000 53766 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv +[rank2]:E0603 16:12:26.877000 53766 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_op.op( +[rank2]:E0603 16:12:26.877000 53766 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2411, in irecv +[rank2]:E0603 16:12:26.877000 53766 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] return group.recv([tensor], group_src, tag) +[rank2]:E0603 16:12:26.877000 53766 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1 +[rank2]:E0603 16:12:26.877000 53766 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error. 
+[rank2]:E0603 16:12:26.877000 53766 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Last error: +[rank2]:E0603 16:12:26.877000 53766 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Error while creating shared memory segment /dev/shm/nccl-veLfEs (size 8257920), error: No space left on device (28) +[rank2]:E0603 16:12:26.877000 53766 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] +[rank2]:E0603 16:12:26.877000 53766 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] To execute this test, run the following from the base repo dir: +[rank2]:E0603 16:12:26.877000 53766 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_inference_pipelining_with_interleaving +[rank2]:E0603 16:12:26.877000 53766 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] +[rank2]:E0603 16:12:26.877000 53766 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 +[rank2]:E0603 16:12:26.877000 53766 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 2 with exit code: 10 +[rank6]:[W603 16:12:27.325486798 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +[rank2]:[W603 16:12:27.341952636 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +[rank4]:[W603 16:12:27.359584073 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +[rank0]:[W603 16:12:27.428477181 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +[rank7]:[W603 16:12:27.760319732 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +[rank3]:[W603 16:12:27.767121477 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +[rank1]:[W603 16:12:27.809854378 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +[rank5]:[W603 16:12:27.932506932 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. 
+ERROR
+test_inference_pipelining_without_interleaving (test_pipeline_parallel_fwd_bwd.NcclPipelineParallelForwardBackwardTest) ... Process 7 terminated with exit code 10, terminating remaining processes.
+[rank0]:[W603 16:12:49.087699947 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device.
+[rank5]:[W603 16:12:49.103615317 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 5] using GPU 5 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device.
+[rank7]:[W603 16:12:49.117766760 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 7] using GPU 7 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device.
+[rank1]:[W603 16:12:49.126433052 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device.
+[rank4]:[W603 16:12:49.181958389 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 4] using GPU 4 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device.
+[rank3]:[W603 16:12:49.185125920 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device.
+[rank6]:[W603 16:12:49.191761358 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 6] using GPU 6 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device.
+[rank2]:[W603 16:12:49.201756693 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device.
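Every failing rank in this run reports the same root cause: NCCL cannot create its shared-memory segment under /dev/shm ("No space left on device (28)", roughly 8 MB per rank), so the pipeline-parallel p2p calls abort before any send/recv completes. The snippet below is a minimal diagnostic sketch, not part of this patch; the helper name and the one-segment threshold are assumptions, and it only checks free space on /dev/shm on the test node before the tests are re-run.

import shutil

def check_dev_shm(min_free_bytes: int = 8 * 1024 * 1024) -> None:
    # Each rank in the log above tried to create an ~8 MB NCCL segment under /dev/shm,
    # so warn when less than one segment's worth of space is free.
    usage = shutil.disk_usage("/dev/shm")
    print(f"/dev/shm: total={usage.total} used={usage.used} free={usage.free} bytes")
    if usage.free < min_free_bytes:
        raise RuntimeError(
            "/dev/shm is (nearly) full; free space or enlarge it "
            "(for example, remount it with a larger size or start the container "
            "with a bigger --shm-size) before re-running the NCCL pipeline-parallel tests."
        )

if __name__ == "__main__":
    check_dev_shm()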
+[dist init] rank = 2, world_size = 8
+ > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 1056
+[dist init] rank = 4, world_size = 8
+ > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 1056
+/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/utils.py:81: UserWarning: This function is only for unittest
+ warnings.warn("This function is only for unittest")
+[dist init] rank = 6, world_size = 8
+ > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 1056
+[dist init] rank = 0, world_size = 8
+ > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 1056
+/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
+ with torch.cuda.amp.autocast(
+/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:208: UserWarning: Converting a tensor with requires_grad=True to a scalar may lead to unexpected behavior.
+Consider using tensor.detach() first. (Triggered internally at /skishore/github/pytorch/aten/src/ATen/native/Scalar.cpp:22.)
+ self.assertEqual(x.item() / microbatch_size, target_loss.item())
+/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead.
+ torch.cuda.amp.GradScaler(init_scale=4.0)
+[rank4]:E0603 16:13:52.280000 57801 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Caught exception:
+[rank4]:E0603 16:13:52.280000 57801 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Traceback (most recent call last):
+[rank4]:E0603 16:13:52.280000 57801 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test
+[rank4]:E0603 16:13:52.280000 57801 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] getattr(self, test_name)()
+[rank4]:E0603 16:13:52.280000 57801 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper
+[rank4]:E0603 16:13:52.280000 57801 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] fn()
+[rank4]:E0603 16:13:52.280000 57801 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper
+[rank4]:E0603 16:13:52.280000 57801 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] method(*args, **kwargs)
+[rank4]:E0603 16:13:52.280000 57801 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 244, in test_inference_pipelining_without_interleaving
+[rank4]:E0603 16:13:52.280000 57801 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] self._forward_backward_test_impl(
+[rank4]:E0603 16:13:52.280000 57801 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl
+[rank4]:E0603 16:13:52.280000 57801 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] loss = fwd_bwd_func(
+[rank4]:E0603 16:13:52.280000 57801 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 332, in forward_backward_pipelining_without_interleaving
+[rank4]:E0603 16:13:52.280000 57801 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor = recv_forward(
+[rank4]:E0603 16:13:52.280000 57801 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 99, in recv_forward
+[rank4]:E0603 16:13:52.280000 57801 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_communication.recv_forward(
+[rank4]:E0603 16:13:52.280000 57801 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 342, in recv_forward
+[rank4]:E0603 16:13:52.280000 57801 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor, _ = _communicate(
+[rank4]:E0603 16:13:52.280000 57801 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate
+[rank4]:E0603 16:13:52.280000 57801 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm)
+[rank4]:E0603 16:13:52.280000 57801 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops
+[rank4]:E0603 16:13:52.280000 57801 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] reqs = torch.distributed.batch_isend_irecv(ops)
+[rank4]:E0603 16:13:52.280000 57801 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv
+[rank4]:E0603 16:13:52.280000 57801 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_op.op(
+[rank4]:E0603 16:13:52.280000 57801 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2411, in irecv
+[rank4]:E0603 16:13:52.280000 57801 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] return group.recv([tensor], group_src, tag)
+[rank4]:E0603 16:13:52.280000 57801 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1
+[rank4]:E0603 16:13:52.280000 57801 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error.
+[rank4]:E0603 16:13:52.280000 57801 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Last error:
+[rank4]:E0603 16:13:52.280000 57801 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Error while creating shared memory segment /dev/shm/nccl-WflEm3 (size 8257920), error: No space left on device (28)
+[rank4]:E0603 16:13:52.280000 57801 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741]
+[rank4]:E0603 16:13:52.280000 57801 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] To execute this test, run the following from the base repo dir:
+[rank4]:E0603 16:13:52.280000 57801 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_inference_pipelining_without_interleaving
+[rank4]:E0603 16:13:52.280000 57801 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741]
+[rank4]:E0603 16:13:52.280000 57801 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0
+[rank4]:E0603 16:13:52.280000 57801 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 4 with exit code: 10
+[rank0]:E0603 16:13:52.281000 57797 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Caught exception:
+[rank0]:E0603 16:13:52.281000 57797 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Traceback (most recent call last):
+[rank0]:E0603 16:13:52.281000 57797 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test
+[rank0]:E0603 16:13:52.281000 57797 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] getattr(self, test_name)()
+[rank0]:E0603 16:13:52.281000 57797 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper
+[rank0]:E0603 16:13:52.281000 57797 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] fn()
+[rank0]:E0603 16:13:52.281000 57797 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper
+[rank0]:E0603 16:13:52.281000 57797 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] method(*args, **kwargs)
+[rank0]:E0603 16:13:52.281000 57797 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 244, in test_inference_pipelining_without_interleaving
+[rank0]:E0603 16:13:52.281000 57797 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] self._forward_backward_test_impl(
+[rank0]:E0603 16:13:52.281000 57797 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl
+[rank0]:E0603 16:13:52.281000 57797 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] loss = fwd_bwd_func(
+[rank0]:E0603 16:13:52.281000 57797 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 349, in forward_backward_pipelining_without_interleaving
+[rank0]:E0603 16:13:52.281000 57797 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] send_forward(
+[rank0]:E0603 16:13:52.281000 57797 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 145, in send_forward
+[rank0]:E0603 16:13:52.281000 57797 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_communication.send_forward(
+[rank0]:E0603 16:13:52.281000 57797 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 401, in send_forward
+[rank0]:E0603 16:13:52.281000 57797 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] _communicate(
+[rank0]:E0603 16:13:52.281000 57797 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate
+[rank0]:E0603 16:13:52.281000 57797 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm)
+[rank0]:E0603 16:13:52.281000 57797 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops
+[rank0]:E0603 16:13:52.281000 57797 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] reqs = torch.distributed.batch_isend_irecv(ops)
+[rank0]:E0603 16:13:52.281000 57797 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv
+[rank0]:E0603 16:13:52.281000 57797 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_op.op(
+[rank0]:E0603 16:13:52.281000 57797 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2366, in isend
+[rank0]:E0603 16:13:52.281000 57797 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] return group.send([tensor], group_dst, tag)
+[rank0]:E0603 16:13:52.281000 57797 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1
+[rank0]:E0603 16:13:52.281000 57797 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error.
+[rank0]:E0603 16:13:52.281000 57797 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Last error:
+[rank0]:E0603 16:13:52.281000 57797 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Error while creating shared memory segment /dev/shm/nccl-76Ma69 (size 8257920), error: No space left on device (28)
+[rank0]:E0603 16:13:52.281000 57797 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741]
+[rank0]:E0603 16:13:52.281000 57797 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] To execute this test, run the following from the base repo dir:
+[rank0]:E0603 16:13:52.281000 57797 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_inference_pipelining_without_interleaving
+[rank0]:E0603 16:13:52.281000 57797 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741]
+[rank0]:E0603 16:13:52.281000 57797 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0
+[rank0]:E0603 16:13:52.281000 57797 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 0 with exit code: 10
+[dist init] rank = 1, world_size = 8
+[rank3]:E0603 16:13:52.304000 57800 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Caught exception:
+[rank3]:E0603 16:13:52.304000 57800 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Traceback (most recent call last):
+[rank3]:E0603 16:13:52.304000 57800 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test
+[rank3]:E0603 16:13:52.304000 57800 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] getattr(self, test_name)()
+[rank3]:E0603 16:13:52.304000 57800 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper
+[rank3]:E0603 16:13:52.304000 57800 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] fn()
+[rank3]:E0603 16:13:52.304000 57800 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper
+[rank3]:E0603 16:13:52.304000 57800 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] method(*args, **kwargs)
+[rank3]:E0603 16:13:52.304000 57800 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 244, in test_inference_pipelining_without_interleaving
+[rank3]:E0603 16:13:52.304000 57800 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] self._forward_backward_test_impl(
+[rank3]:E0603 16:13:52.304000 57800 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl
+[rank3]:E0603 16:13:52.304000 57800 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] loss = fwd_bwd_func(
+[rank3]:E0603 16:13:52.304000 57800
/skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 332, in forward_backward_pipelining_without_interleaving +[rank3]:E0603 16:13:52.304000 57800 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor = recv_forward( +[rank3]:E0603 16:13:52.304000 57800 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 99, in recv_forward +[rank3]:E0603 16:13:52.304000 57800 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_communication.recv_forward( +[rank3]:E0603 16:13:52.304000 57800 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 342, in recv_forward +[rank3]:E0603 16:13:52.304000 57800 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor, _ = _communicate( +[rank3]:E0603 16:13:52.304000 57800 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate +[rank3]:E0603 16:13:52.304000 57800 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm) +[rank3]:E0603 16:13:52.304000 57800 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops +[rank3]:E0603 16:13:52.304000 57800 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] reqs = torch.distributed.batch_isend_irecv(ops) +[rank3]:E0603 16:13:52.304000 57800 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv +[rank3]:E0603 16:13:52.304000 57800 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_op.op( +[rank3]:E0603 16:13:52.304000 57800 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2411, in irecv +[rank3]:E0603 16:13:52.304000 57800 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] return group.recv([tensor], group_src, tag) +[rank3]:E0603 16:13:52.304000 57800 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1 +[rank3]:E0603 16:13:52.304000 57800 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error. 
+[rank3]:E0603 16:13:52.304000 57800 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Last error: +[rank3]:E0603 16:13:52.304000 57800 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Error while creating shared memory segment /dev/shm/nccl-UuQrwh (size 8257920), error: No space left on device (28) +[rank3]:E0603 16:13:52.304000 57800 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] +[rank3]:E0603 16:13:52.304000 57800 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] To execute this test, run the following from the base repo dir: +[rank3]:E0603 16:13:52.304000 57800 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_inference_pipelining_without_interleaving +[rank3]:E0603 16:13:52.304000 57800 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] +[rank3]:E0603 16:13:52.304000 57800 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 +[rank3]:E0603 16:13:52.304000 57800 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 3 with exit code: 10 +[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Caught exception: +[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Traceback (most recent call last): +[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test +[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] getattr(self, test_name)() +[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper +[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] fn() +[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper +[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] method(*args, **kwargs) +[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 244, in test_inference_pipelining_without_interleaving +[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] self._forward_backward_test_impl( +[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl +[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] loss = fwd_bwd_func( +[rank7]:E0603 16:13:52.304000 57804 
/skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 367, in forward_backward_pipelining_without_interleaving +[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor: List[Union[None, torch.Tensor, FutureTensor]] = recv_forward(tensor_shapes=recv_tensor_shapes, dtype=dtype, async_comm=async_comm) +[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 99, in recv_forward +[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_communication.recv_forward( +[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 342, in recv_forward +[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor, _ = _communicate( +[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate +[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm) +[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops +[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] reqs = torch.distributed.batch_isend_irecv(ops) +[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv +[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_op.op( +[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2411, in irecv +[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] return group.recv([tensor], group_src, tag) +[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1 +[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] ncclSystemError: System call (e.g. 
socket, malloc) or external library call failed or device error. +[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Last error: +[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Error while creating shared memory segment /dev/shm/nccl-SMCF6k (size 8257920), error: No space left on device (28) +[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] +[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] To execute this test, run the following from the base repo dir: +[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_inference_pipelining_without_interleaving +[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] +[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 +[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 7 with exit code: 10 +[dist init] rank = 3, world_size = 8 +[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Caught exception: +[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Traceback (most recent call last): +[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test +[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] getattr(self, test_name)() +[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper +[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] fn() +[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper +[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] method(*args, **kwargs) +[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 244, in test_inference_pipelining_without_interleaving +[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] self._forward_backward_test_impl( +[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl +[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] loss = 
fwd_bwd_func( +[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 332, in forward_backward_pipelining_without_interleaving +[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor = recv_forward( +[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 99, in recv_forward +[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_communication.recv_forward( +[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 342, in recv_forward +[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor, _ = _communicate( +[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate +[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm) +[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops +[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] reqs = torch.distributed.batch_isend_irecv(ops) +[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv +[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_op.op( +[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2411, in irecv +[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] return group.recv([tensor], group_src, tag) +[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1 +[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error. 
+[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Last error: +[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Error while creating shared memory segment /dev/shm/nccl-eVwAPm (size 8257920), error: No space left on device (28) +[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] +[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] To execute this test, run the following from the base repo dir: +[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_inference_pipelining_without_interleaving +[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] +[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 +[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 5 with exit code: 10 +[dist init] rank = 7, world_size = 8 +[dist init] rank = 5, world_size = 8 +[rank2]:[W603 16:13:52.645901574 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +[rank4]:[W603 16:13:52.659257967 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +[rank0]:[W603 16:13:52.771542944 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +[rank6]:[W603 16:13:52.781064435 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +[rank1]:[W603 16:13:53.269871734 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +[rank3]:[W603 16:13:53.278681750 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +[rank7]:[W603 16:13:53.365874516 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +[rank5]:[W603 16:13:53.374848385 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. 
For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +ERROR +test_inference_pipelining_without_interleaving_ucc_for_p2p (test_pipeline_parallel_fwd_bwd.NcclPipelineParallelForwardBackwardTest) ... Process 1 terminated with exit code 10, terminating remaining processes. +[rank0]:[W603 16:14:15.213860590 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank6]:[W603 16:14:15.229402887 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 6] using GPU 6 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank4]:[W603 16:14:15.237331454 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 4] using GPU 4 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank2]:[W603 16:14:15.240780322 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank7]:[W603 16:14:15.246388306 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 7] using GPU 7 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank1]:[W603 16:14:15.251613170 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank5]:[W603 16:14:15.262122898 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 5] using GPU 5 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank3]:[W603 16:14:15.264435740 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[dist init] rank = 0, world_size = 8 +[dist init] rank = 3, world_size = 8 +[dist init] rank = 4, world_size = 8 +[dist init] rank = 5, world_size = 8 +[dist init] rank = 2, world_size = 8 +[dist init] rank = 7, world_size = 8 +[dist init] rank = 1, world_size = 8 +[dist init] rank = 6, world_size = 8 +[rank0]:[W603 16:14:24.075517383 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. 
For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +skipped 'Test skipped at subprocess level, look at subprocess log for skip reason' +test_learning_async_pipelining_with_interleaving (test_pipeline_parallel_fwd_bwd.NcclPipelineParallelForwardBackwardTest) ... [rank0]:[W603 16:14:36.025146016 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank3]:[W603 16:14:37.810567302 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank2]:[W603 16:14:37.830866809 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank1]:[W603 16:14:37.831099595 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank4]:[W603 16:14:37.833581789 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 4] using GPU 4 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank7]:[W603 16:14:37.850227383 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 7] using GPU 7 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank6]:[W603 16:14:37.852124078 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 6] using GPU 6 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank5]:[W603 16:14:37.852242875 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 5] using GPU 5 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. 
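The eight "using GPU N as device used by this process is currently unknown" warnings above come from ProcessGroupNCCL, and the warning text itself points at the fix: pass device_id to init_process_group so the rank-to-GPU mapping is explicit. A minimal sketch of that suggestion, assuming one GPU per rank and that MASTER_ADDR/MASTER_PORT are already set in the environment (the helper name init_distributed and the argument plumbing are illustrative, not taken from the apex test harness, which is not shown in this excerpt):

import torch
import torch.distributed as dist

def init_distributed(rank: int, world_size: int) -> None:
    # Bind this process to its GPU before creating the process group so the
    # rank -> device mapping does not have to be inferred later by NCCL.
    torch.cuda.set_device(rank)
    dist.init_process_group(
        backend="nccl",
        rank=rank,
        world_size=world_size,
        # Recent PyTorch releases accept device_id here; passing it avoids the
        # "device used by this process is currently unknown" warning seen above.
        device_id=torch.device(f"cuda:{rank}"),
    )

This only illustrates the warning's own recommendation; it does not change the NCCL shared-memory failures ("No space left on device" when creating /dev/shm/nccl-*) reported earlier in the log, which point at an exhausted /dev/shm on the test machine rather than at the test code.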
+[dist init] rank = 4, world_size = 8 + > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 2112 +[dist init] rank = 2, world_size = 8 + > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 2112 +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/utils.py:81: UserWarning: This function is only for unittest + warnings.warn("This function is only for unittest") +[dist init] rank = 6, world_size = 8 + > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 2112 +[dist init] rank = 0, world_size = 8 + > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 2112 +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. 
+ with torch.cuda.amp.autocast( + > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 2112 + > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 2112 + > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 2112 + > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 2112 + > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 2112 + > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 2112 + > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 2112 + > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 2112 +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:208: UserWarning: Converting a tensor with requires_grad=True to a scalar may lead to unexpected behavior. +Consider using tensor.detach() first. (Triggered internally at /skishore/github/pytorch/aten/src/ATen/native/Scalar.cpp:22.) + self.assertEqual(x.item() / microbatch_size, target_loss.item()) + > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 2112 +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:208: UserWarning: Converting a tensor with requires_grad=True to a scalar may lead to unexpected behavior. +Consider using tensor.detach() first. (Triggered internally at /skishore/github/pytorch/aten/src/ATen/native/Scalar.cpp:22.) + self.assertEqual(x.item() / microbatch_size, target_loss.item()) + > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 2112 + > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 2112 + > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 2112 +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. + torch.cuda.amp.GradScaler(init_scale=4.0) + > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 2112 +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. + torch.cuda.amp.GradScaler(init_scale=4.0) +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. + torch.cuda.amp.GradScaler(init_scale=4.0) +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. + torch.cuda.amp.GradScaler(init_scale=4.0) + > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 2112 +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. + torch.cuda.amp.GradScaler(init_scale=4.0) +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. 
+ torch.cuda.amp.GradScaler(init_scale=4.0) +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. + torch.cuda.amp.GradScaler(init_scale=4.0) + > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 2112 +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. + torch.cuda.amp.GradScaler(init_scale=4.0) + > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 2112 + > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 2112 + > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 2112 + > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 2112 + > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 2112 + > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 2112 + > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 2112 + > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 2112 + > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 2112 + > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 2112 + > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 2112 + > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 2112 + > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 2112 +[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Caught exception: +[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Traceback (most recent call last): +[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test +[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] getattr(self, test_name)() +[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper +[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] fn() +[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper +[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] method(*args, **kwargs) +[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 272, in test_learning_async_pipelining_with_interleaving +[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] self._forward_backward_test_impl( +[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File 
"/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl +[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] loss = fwd_bwd_func( +[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_with_interleaving.py", line 276, in _forward_backward_pipelining_with_interleaving +[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor = p2p_communication.send_forward_recv_forward( +[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 515, in send_forward_recv_forward +[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor, _ = _communicate( +[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate +[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm) +[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops +[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] reqs = torch.distributed.batch_isend_irecv(ops) +[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv +[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_op.op( +[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2366, in isend +[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] return group.send([tensor], group_dst, tag) +[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1 +[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error. 
+[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Last error: +[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Error while creating shared memory segment /dev/shm/nccl-BS3aqb (size 8257920), error: No space left on device (28) +[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] +[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] To execute this test, run the following from the base repo dir: +[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_learning_async_pipelining_with_interleaving +[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] +[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 +[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 0 with exit code: 10 +[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Caught exception: +[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Traceback (most recent call last): +[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test +[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] getattr(self, test_name)() +[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper +[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] fn() +[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper +[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] method(*args, **kwargs) +[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 272, in test_learning_async_pipelining_with_interleaving +[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] self._forward_backward_test_impl( +[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl +[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] loss = fwd_bwd_func( +[rank2]:E0603 16:16:56.226000 62445 
/skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_with_interleaving.py", line 219, in _forward_backward_pipelining_with_interleaving +[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_communication.recv_forward( +[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 342, in recv_forward +[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor, _ = _communicate( +[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate +[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm) +[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops +[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] reqs = torch.distributed.batch_isend_irecv(ops) +[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv +[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_op.op( +[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2411, in irecv +[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] return group.recv([tensor], group_src, tag) +[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1 +[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error. 
+[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Last error: +[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Error while creating shared memory segment /dev/shm/nccl-vuRJwT (size 8257920), error: No space left on device (28) +[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] +[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] To execute this test, run the following from the base repo dir: +[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_learning_async_pipelining_with_interleaving +[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] +[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 +[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 2 with exit code: 10
Ranks 1, 3, 4, 5, 6 and 7 failed in the same way: each printed an identical traceback ending in the same ncclSystemError, differing only in the rank/PID and the /dev/shm/nccl-* segment name in "Error while creating shared memory segment ... (size 8257920), error: No space left on device (28)", and each process exited with exit code 10. Rank 1 hit the error on the send side (send_forward_recv_forward -> torch.distributed.isend) instead of in recv_forward.
+[rank6]:[W603 16:16:56.562566005 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
Ranks 0-5 and 7 logged the same destroy_process_group() warning.
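Every failure recorded above has the same root cause: NCCL could not create its shared-memory segment because /dev/shm on the test host was out of space ("No space left on device (28)" while creating an 8257920-byte segment). The sketch below is not part of this patch; it only illustrates how one might verify /dev/shm headroom before launching the 8-rank test. The segment size is taken from the log; the one-segment-per-rank multiplier is an assumption.

    # check_shm.py -- illustrative only; checks whether /dev/shm can hold the
    # NCCL shared-memory segments reported in the failing tests above.
    import shutil

    SEGMENT_BYTES = 8_257_920  # size from the "Error while creating shared memory segment" message
    WORLD_SIZE = 8             # ranks used by NcclPipelineParallelForwardBackwardTest

    usage = shutil.disk_usage("/dev/shm")  # /dev/shm is a tmpfs mount on Linux
    needed = SEGMENT_BYTES * WORLD_SIZE    # rough lower bound; NCCL may allocate more
    print(f"/dev/shm free: {usage.free / 2**20:.1f} MiB, "
          f"rough NCCL need: {needed / 2**20:.1f} MiB")
    if usage.free < needed:
        raise SystemExit("Not enough space in /dev/shm; free space or enlarge the mount "
                         "(e.g. docker run --shm-size) before running the test.")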
+ERROR
+test_learning_async_pipelining_without_interleaving (test_pipeline_parallel_fwd_bwd.NcclPipelineParallelForwardBackwardTest) ... Process 6 terminated with exit code 10, terminating remaining processes.
+[rank6]:[W603 16:17:20.110087992 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 6] using GPU 6 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device.
Ranks 0-5 and 7 logged the same "using GPU N as device used by this process is currently unknown" warning.
+/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/utils.py:81: UserWarning: This function is only for unittest
+ warnings.warn("This function is only for unittest")
+[dist init] rank = 6, world_size = 8
+ > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 1056
+[dist init] rank = 0, world_size = 8
+ > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 1056
+[dist init] rank = 4, world_size = 8
+ > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 1056
+[dist init] rank = 2, world_size = 8
+ > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 1056
+/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
+ with torch.cuda.amp.autocast(
The autocast and GradScaler FutureWarnings and the assertEqual UserWarning below were each emitted by multiple ranks, and the four per-stage parameter counts above are reprinted for every model the test builds.
+/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:208: UserWarning: Converting a tensor with requires_grad=True to a scalar may lead to unexpected behavior.
+Consider using tensor.detach() first. (Triggered internally at /skishore/github/pytorch/aten/src/ATen/native/Scalar.cpp:22.)
+ self.assertEqual(x.item() / microbatch_size, target_loss.item())
+/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead.
+ torch.cuda.amp.GradScaler(init_scale=4.0) + > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 1056 +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. + torch.cuda.amp.GradScaler(init_scale=4.0) +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. + torch.cuda.amp.GradScaler(init_scale=4.0) + > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 1056 + > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 1056 + > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 1056 + > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 1056 + > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 1056 + > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 1056 + > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 1056 + > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 1056 + > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 1056 + > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 1056 + > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 1056 + > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 1056 + > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 1056 +[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Caught exception: +[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Traceback (most recent call last): +[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test +[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] getattr(self, test_name)() +[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper +[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] fn() +[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper +[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] method(*args, **kwargs) +[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 249, in test_learning_async_pipelining_without_interleaving +[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] self._forward_backward_test_impl( +[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File 
"/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl +[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] loss = fwd_bwd_func( +[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 332, in forward_backward_pipelining_without_interleaving +[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor = recv_forward( +[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 99, in recv_forward +[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_communication.recv_forward( +[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 342, in recv_forward +[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor, _ = _communicate( +[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate +[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm) +[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops +[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] reqs = torch.distributed.batch_isend_irecv(ops) +[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv +[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_op.op( +[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2411, in irecv +[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] return group.recv([tensor], group_src, tag) +[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 
2.25.1 +[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error. +[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Last error: +[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Error while creating shared memory segment /dev/shm/nccl-gwbLVE (size 8257920), error: No space left on device (28) +[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] +[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] To execute this test, run the following from the base repo dir: +[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_learning_async_pipelining_without_interleaving +[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] +[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 +[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 2 with exit code: 10 +[rank4]:E0603 16:19:14.430000 66592 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Caught exception: +[rank4]:E0603 16:19:14.430000 66592 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Traceback (most recent call last): +[rank4]:E0603 16:19:14.430000 66592 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test +[rank4]:E0603 16:19:14.430000 66592 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] getattr(self, test_name)() +[rank4]:E0603 16:19:14.430000 66592 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper +[rank4]:E0603 16:19:14.430000 66592 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] fn() +[rank4]:E0603 16:19:14.430000 66592 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper +[rank4]:E0603 16:19:14.430000 66592 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] method(*args, **kwargs) +[rank4]:E0603 16:19:14.430000 66592 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 249, in test_learning_async_pipelining_without_interleaving +[rank4]:E0603 16:19:14.430000 66592 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] self._forward_backward_test_impl( +[rank4]:E0603 16:19:14.430000 66592 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl 
+[rank4]:E0603 16:19:14.430000 66592 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] loss = fwd_bwd_func( +[rank4]:E0603 16:19:14.430000 66592 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 332, in forward_backward_pipelining_without_interleaving +[rank4]:E0603 16:19:14.430000 66592 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor = recv_forward( +[rank4]:E0603 16:19:14.430000 66592 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 99, in recv_forward +[rank4]:E0603 16:19:14.430000 66592 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_communication.recv_forward( +[rank4]:E0603 16:19:14.430000 66592 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 342, in recv_forward +[rank4]:E0603 16:19:14.430000 66592 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor, _ = _communicate( +[rank4]:E0603 16:19:14.430000 66592 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate +[rank4]:E0603 16:19:14.430000 66592 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm) +[rank4]:E0603 16:19:14.430000 66592 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops +[rank4]:E0603 16:19:14.430000 66592 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] reqs = torch.distributed.batch_isend_irecv(ops) +[rank4]:E0603 16:19:14.430000 66592 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv +[rank4]:E0603 16:19:14.430000 66592 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_op.op( +[rank4]:E0603 16:19:14.430000 66592 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2411, in irecv +[rank4]:E0603 16:19:14.430000 66592 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] return group.recv([tensor], group_src, tag) +[rank4]:E0603 16:19:14.430000 66592 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1 +[rank4]:E0603 16:19:14.430000 66592 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] 
ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error. +[rank4]:E0603 16:19:14.430000 66592 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Last error: +[rank4]:E0603 16:19:14.430000 66592 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Error while creating shared memory segment /dev/shm/nccl-gP7wMi (size 8257920), error: No space left on device (28) +[rank4]:E0603 16:19:14.430000 66592 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] +[rank4]:E0603 16:19:14.430000 66592 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] To execute this test, run the following from the base repo dir: +[rank4]:E0603 16:19:14.430000 66592 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_learning_async_pipelining_without_interleaving +[rank4]:E0603 16:19:14.430000 66592 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] +[rank4]:E0603 16:19:14.430000 66592 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 +[rank4]:E0603 16:19:14.430000 66592 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 4 with exit code: 10 +[rank5]:E0603 16:19:14.430000 66593 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Caught exception: +[rank5]:E0603 16:19:14.430000 66593 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Traceback (most recent call last): +[rank5]:E0603 16:19:14.430000 66593 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test +[rank5]:E0603 16:19:14.430000 66593 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] getattr(self, test_name)() +[rank5]:E0603 16:19:14.430000 66593 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper +[rank5]:E0603 16:19:14.430000 66593 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] fn() +[rank5]:E0603 16:19:14.430000 66593 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper +[rank5]:E0603 16:19:14.430000 66593 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] method(*args, **kwargs) +[rank5]:E0603 16:19:14.430000 66593 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 249, in test_learning_async_pipelining_without_interleaving +[rank5]:E0603 16:19:14.430000 66593 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] self._forward_backward_test_impl( +[rank5]:E0603 16:19:14.430000 66593 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl +[rank5]:E0603 16:19:14.430000 66593 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] loss = 
fwd_bwd_func( +[rank5]:E0603 16:19:14.430000 66593 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 332, in forward_backward_pipelining_without_interleaving +[rank5]:E0603 16:19:14.430000 66593 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor = recv_forward( +[rank5]:E0603 16:19:14.430000 66593 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 99, in recv_forward +[rank5]:E0603 16:19:14.430000 66593 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_communication.recv_forward( +[rank5]:E0603 16:19:14.430000 66593 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 342, in recv_forward +[rank5]:E0603 16:19:14.430000 66593 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor, _ = _communicate( +[rank5]:E0603 16:19:14.430000 66593 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate +[rank5]:E0603 16:19:14.430000 66593 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm) +[rank5]:E0603 16:19:14.430000 66593 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops +[rank5]:E0603 16:19:14.430000 66593 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] reqs = torch.distributed.batch_isend_irecv(ops) +[rank5]:E0603 16:19:14.430000 66593 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv +[rank5]:E0603 16:19:14.430000 66593 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_op.op( +[rank5]:E0603 16:19:14.430000 66593 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2411, in irecv +[rank5]:E0603 16:19:14.430000 66593 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] return group.recv([tensor], group_src, tag) +[rank5]:E0603 16:19:14.430000 66593 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1 +[rank5]:E0603 16:19:14.430000 66593 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error. 
+[rank5]:E0603 16:19:14.430000 66593 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Last error: +[rank5]:E0603 16:19:14.430000 66593 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Error while creating shared memory segment /dev/shm/nccl-EwoJJ3 (size 8257920), error: No space left on device (28) +[rank5]:E0603 16:19:14.430000 66593 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] +[rank5]:E0603 16:19:14.430000 66593 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] To execute this test, run the following from the base repo dir: +[rank5]:E0603 16:19:14.430000 66593 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_learning_async_pipelining_without_interleaving +[rank5]:E0603 16:19:14.430000 66593 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] +[rank5]:E0603 16:19:14.430000 66593 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 +[rank5]:E0603 16:19:14.430000 66593 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 5 with exit code: 10 +[dist init] rank = 5, world_size = 8 +[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Caught exception: +[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Traceback (most recent call last): +[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test +[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] getattr(self, test_name)() +[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper +[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] fn() +[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper +[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] method(*args, **kwargs) +[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 249, in test_learning_async_pipelining_without_interleaving +[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] self._forward_backward_test_impl( +[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl +[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] loss = fwd_bwd_func( +[rank7]:E0603 16:19:14.431000 66595 
/skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 367, in forward_backward_pipelining_without_interleaving +[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor: List[Union[None, torch.Tensor, FutureTensor]] = recv_forward(tensor_shapes=recv_tensor_shapes, dtype=dtype, async_comm=async_comm) +[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 99, in recv_forward +[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_communication.recv_forward( +[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 342, in recv_forward +[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor, _ = _communicate( +[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate +[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm) +[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops +[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] reqs = torch.distributed.batch_isend_irecv(ops) +[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv +[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_op.op( +[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2411, in irecv +[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] return group.recv([tensor], group_src, tag) +[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1 +[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] ncclSystemError: System call (e.g. 
socket, malloc) or external library call failed or device error. +[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Last error: +[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Error while creating shared memory segment /dev/shm/nccl-bTPujE (size 8257920), error: No space left on device (28) +[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] +[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] To execute this test, run the following from the base repo dir: +[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_learning_async_pipelining_without_interleaving +[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] +[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 +[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 7 with exit code: 10 +[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Caught exception: +[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Traceback (most recent call last): +[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test +[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] getattr(self, test_name)() +[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper +[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] fn() +[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper +[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] method(*args, **kwargs) +[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 249, in test_learning_async_pipelining_without_interleaving +[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] self._forward_backward_test_impl( +[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl +[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] loss = fwd_bwd_func( +[rank1]:E0603 
16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 349, in forward_backward_pipelining_without_interleaving +[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] send_forward( +[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 145, in send_forward +[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_communication.send_forward( +[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 401, in send_forward +[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] _communicate( +[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate +[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm) +[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops +[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] reqs = torch.distributed.batch_isend_irecv(ops) +[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv +[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_op.op( +[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2366, in isend +[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] return group.send([tensor], group_dst, tag) +[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1 +[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error. 
+[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Last error: +[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Error while creating shared memory segment /dev/shm/nccl-OSTKDz (size 8257920), error: No space left on device (28) +[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] +[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] To execute this test, run the following from the base repo dir: +[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_learning_async_pipelining_without_interleaving +[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] +[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 +[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 1 with exit code: 10 +[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Caught exception: +[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Traceback (most recent call last): +[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test +[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] getattr(self, test_name)() +[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper +[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] fn() +[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper +[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] method(*args, **kwargs) +[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 249, in test_learning_async_pipelining_without_interleaving +[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] self._forward_backward_test_impl( +[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl +[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] loss = fwd_bwd_func( +[rank3]:E0603 16:19:14.432000 66591 
/skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 332, in forward_backward_pipelining_without_interleaving +[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor = recv_forward( +[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 99, in recv_forward +[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_communication.recv_forward( +[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 342, in recv_forward +[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor, _ = _communicate( +[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate +[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm) +[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops +[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] reqs = torch.distributed.batch_isend_irecv(ops) +[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv +[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_op.op( +[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2411, in irecv +[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] return group.recv([tensor], group_src, tag) +[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1 +[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error. 
+[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Last error: +[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Error while creating shared memory segment /dev/shm/nccl-IVeBCr (size 8257920), error: No space left on device (28) +[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] +[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] To execute this test, run the following from the base repo dir: +[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_learning_async_pipelining_without_interleaving +[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] +[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 +[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 3 with exit code: 10 +[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Caught exception: +[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Traceback (most recent call last): +[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test +[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] getattr(self, test_name)() +[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper +[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] fn() +[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper +[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] method(*args, **kwargs) +[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 249, in test_learning_async_pipelining_without_interleaving +[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] self._forward_backward_test_impl( +[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl +[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] loss = fwd_bwd_func( +[rank6]:E0603 16:19:14.432000 66594 
/skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 367, in forward_backward_pipelining_without_interleaving +[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor: List[Union[None, torch.Tensor, FutureTensor]] = recv_forward(tensor_shapes=recv_tensor_shapes, dtype=dtype, async_comm=async_comm) +[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 99, in recv_forward +[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_communication.recv_forward( +[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 342, in recv_forward +[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor, _ = _communicate( +[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate +[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm) +[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops +[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] reqs = torch.distributed.batch_isend_irecv(ops) +[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv +[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_op.op( +[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2411, in irecv +[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] return group.recv([tensor], group_src, tag) +[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1 +[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] ncclSystemError: System call (e.g. 
socket, malloc) or external library call failed or device error. +[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Last error: +[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Error while creating shared memory segment /dev/shm/nccl-DLYrhm (size 8257920), error: No space left on device (28) +[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] +[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] To execute this test, run the following from the base repo dir: +[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_learning_async_pipelining_without_interleaving +[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] +[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 +[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 6 with exit code: 10 +[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Caught exception: +[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Traceback (most recent call last): +[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test +[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] getattr(self, test_name)() +[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper +[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] fn() +[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper +[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] method(*args, **kwargs) +[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 249, in test_learning_async_pipelining_without_interleaving +[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] self._forward_backward_test_impl( +[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl +[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] loss = fwd_bwd_func( +[rank0]:E0603 
16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 349, in forward_backward_pipelining_without_interleaving +[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] send_forward( +[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 145, in send_forward +[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_communication.send_forward( +[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 401, in send_forward +[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] _communicate( +[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate +[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm) +[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops +[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] reqs = torch.distributed.batch_isend_irecv(ops) +[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv +[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_op.op( +[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2366, in isend +[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] return group.send([tensor], group_dst, tag) +[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1 +[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error. 
+[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Last error: +[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Error while creating shared memory segment /dev/shm/nccl-5FNsgR (size 8257920), error: No space left on device (28) +[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] +[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] To execute this test, run the following from the base repo dir: +[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_learning_async_pipelining_without_interleaving +[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] +[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 +[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 0 with exit code: 10 +[dist init] rank = 1, world_size = 8 +[dist init] rank = 7, world_size = 8 +[dist init] rank = 3, world_size = 8 +[rank6]:[W603 16:19:14.744460635 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +[rank2]:[W603 16:19:14.746772096 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +[rank4]:[W603 16:19:14.760300394 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +[rank0]:[W603 16:19:14.772900364 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +[rank1]:[W603 16:19:15.356409455 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +[rank3]:[W603 16:19:15.363179882 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +[rank7]:[W603 16:19:15.494206214 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +[rank5]:[W603 16:19:15.494956748 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. 
For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +ERROR +test_learning_no_pipelining (test_pipeline_parallel_fwd_bwd.NcclPipelineParallelForwardBackwardTest) ... Process 1 terminated with exit code 10, terminating remaining processes. +[rank5]:[W603 16:19:37.713920562 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 5] using GPU 5 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank1]:[W603 16:19:37.722140622 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank4]:[W603 16:19:37.724227198 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 4] using GPU 4 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank7]:[W603 16:19:37.730407901 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 7] using GPU 7 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank3]:[W603 16:19:37.734493395 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank0]:[W603 16:19:37.907402606 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank6]:[W603 16:19:37.929812315 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 6] using GPU 6 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank2]:[W603 16:19:37.930026984 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/utils.py:81: UserWarning: This function is only for unittest + warnings.warn("This function is only for unittest") +[dist init] rank = 0, world_size = 8 + > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 1056 +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. 
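
Every rank's `ncclSystemError` above bottoms out in the same root cause: `Error while creating shared memory segment /dev/shm/nccl-... (size 8257920), error: No space left on device (28)`, i.e. NCCL could not allocate its shared-memory transport buffers because the node's `/dev/shm` tmpfs was full (a common symptom when the test container is started with a small `--shm-size`, or when stale `/dev/shm/nccl-*` segments from earlier aborted runs pile up). A minimal pre-flight check along these lines can confirm the diagnosis before re-running with `NCCL_DEBUG=INFO` as the error message suggests; the 8-rank multiplier and safety margin below are illustrative assumptions, not values taken from this run:

import shutil

# Rough pre-flight check for the "No space left on device" NCCL failure above.
# Assumption: 8 ranks on one node, each needing at least the ~8 MB segment size
# reported in the log (8257920 bytes).
def check_dev_shm(min_free_bytes: int = 8 * 8257920) -> None:
    usage = shutil.disk_usage("/dev/shm")
    if usage.free < min_free_bytes:
        raise RuntimeError(
            f"/dev/shm has only {usage.free} bytes free; NCCL's shared-memory "
            f"transport needs roughly {min_free_bytes}. Enlarge the tmpfs "
            "(e.g. docker run --shm-size=...) or clean up stale /dev/shm/nccl-* files."
        )

if __name__ == "__main__":
    check_dev_shm()

The `destroy_process_group()` warnings printed at exit hint at one plausible way the tmpfs fills up between tests: process groups torn down abnormally may leave their segments behind.
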
+ with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( + > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 1056 + > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 1056 +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:208: UserWarning: Converting a tensor with requires_grad=True to a scalar may lead to unexpected behavior. +Consider using tensor.detach() first. (Triggered internally at /skishore/github/pytorch/aten/src/ATen/native/Scalar.cpp:22.) + self.assertEqual(x.item() / microbatch_size, target_loss.item()) +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:208: UserWarning: Converting a tensor with requires_grad=True to a scalar may lead to unexpected behavior. +Consider using tensor.detach() first. (Triggered internally at /skishore/github/pytorch/aten/src/ATen/native/Scalar.cpp:22.) + self.assertEqual(x.item() / microbatch_size, target_loss.item()) +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:208: UserWarning: Converting a tensor with requires_grad=True to a scalar may lead to unexpected behavior. +Consider using tensor.detach() first. (Triggered internally at /skishore/github/pytorch/aten/src/ATen/native/Scalar.cpp:22.) + self.assertEqual(x.item() / microbatch_size, target_loss.item()) +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:208: UserWarning: Converting a tensor with requires_grad=True to a scalar may lead to unexpected behavior. +Consider using tensor.detach() first. (Triggered internally at /skishore/github/pytorch/aten/src/ATen/native/Scalar.cpp:22.) 
+ self.assertEqual(x.item() / microbatch_size, target_loss.item()) +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:208: UserWarning: Converting a tensor with requires_grad=True to a scalar may lead to unexpected behavior. +Consider using tensor.detach() first. (Triggered internally at /skishore/github/pytorch/aten/src/ATen/native/Scalar.cpp:22.) + self.assertEqual(x.item() / microbatch_size, target_loss.item()) +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:208: UserWarning: Converting a tensor with requires_grad=True to a scalar may lead to unexpected behavior. +Consider using tensor.detach() first. (Triggered internally at /skishore/github/pytorch/aten/src/ATen/native/Scalar.cpp:22.) + self.assertEqual(x.item() / microbatch_size, target_loss.item()) +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:208: UserWarning: Converting a tensor with requires_grad=True to a scalar may lead to unexpected behavior. +Consider using tensor.detach() first. (Triggered internally at /skishore/github/pytorch/aten/src/ATen/native/Scalar.cpp:22.) + self.assertEqual(x.item() / microbatch_size, target_loss.item()) +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:208: UserWarning: Converting a tensor with requires_grad=True to a scalar may lead to unexpected behavior. +Consider using tensor.detach() first. (Triggered internally at /skishore/github/pytorch/aten/src/ATen/native/Scalar.cpp:22.) + self.assertEqual(x.item() / microbatch_size, target_loss.item()) + > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 1056 +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. + torch.cuda.amp.GradScaler(init_scale=4.0) +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. + torch.cuda.amp.GradScaler(init_scale=4.0) +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. + torch.cuda.amp.GradScaler(init_scale=4.0) +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. + torch.cuda.amp.GradScaler(init_scale=4.0) +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. + torch.cuda.amp.GradScaler(init_scale=4.0) + > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 1056 +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. + torch.cuda.amp.GradScaler(init_scale=4.0) +/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. 
+ > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 1056
+ > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 1056
+ > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 1056
+[dist init] rank = 7, world_size = 8
+[dist init] rank = 6, world_size = 8
+[dist init] rank = 2, world_size = 8
+[dist init] rank = 5, world_size = 8
+[dist init] rank = 4, world_size = 8
+[dist init] rank = 1, world_size = 8
+[dist init] rank = 3, world_size = 8
+ok
+test_learning_pipelining_with_interleaving (test_pipeline_parallel_fwd_bwd.NcclPipelineParallelForwardBackwardTest) ... [rank1]:[W603 16:21:13.240201491 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device.
    [... ranks 0-7 each emit the same ProcessGroupNCCL warning for their own GPU ...]
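The ProcessGroupNCCL warning above is advisory: the process group does not yet know which GPU each rank owns. A minimal sketch of the fix the warning itself suggests, assuming a PyTorch build whose init_process_group() accepts device_id and using LOCAL_RANK as an illustrative source of the rank-to-GPU mapping:

    import os
    import torch
    import torch.distributed as dist

    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
    torch.cuda.set_device(local_rank)  # bind the process to its GPU before any NCCL work
    dist.init_process_group(
        backend="nccl",
        device_id=torch.device(f"cuda:{local_rank}"),  # explicit mapping, per the warning text
    )

Either the set_device() call or the device_id argument is usually enough to silence the warning; this is context for reading the log, not a change proposed by the patch.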
+[dist init] rank = 4, world_size = 8
+ > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 2112
+/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/utils.py:81: UserWarning: This function is only for unittest
+ warnings.warn("This function is only for unittest")
+[dist init] rank = 2, world_size = 8
+ > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 2112
+[dist init] rank = 6, world_size = 8
+ > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 2112
+[dist init] rank = 0, world_size = 8
+ > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 2112
+/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
+ with torch.cuda.amp.autocast(
    [... the same FutureWarning is emitted once per rank ...]
+ > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 2112
+ > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 2112
+ > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 2112
+ > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 2112
+/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:208: UserWarning: Converting a tensor with requires_grad=True to a scalar may lead to unexpected behavior.
+Consider using tensor.detach() first. (Triggered internally at /skishore/github/pytorch/aten/src/ATen/native/Scalar.cpp:22.)
+ self.assertEqual(x.item() / microbatch_size, target_loss.item())
    [... the same UserWarning is emitted once per rank ...]
+/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead.
+ torch.cuda.amp.GradScaler(init_scale=4.0)
    [... the same FutureWarning is emitted once per rank ...]
+ > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 2112
+ > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 2112
+ > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 2112
+ > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 2112
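The UserWarning pinned to test_pipeline_parallel_fwd_bwd.py:208 comes from calling .item() on a tensor that still requires grad. Following the hint in the warning text, detaching first keeps the scalar conversion out of autograd; a self-contained sketch (the names are stand-ins mirroring the assertion in the test, not the test's actual code):

    import torch

    def scalar_loss(x: torch.Tensor, microbatch_size: int) -> float:
        # Detach before .item() so the conversion does not involve autograd state,
        # which is what the warning recommends.
        return x.detach().item() / microbatch_size

    assert scalar_loss(torch.tensor(2.0, requires_grad=True), 4) == 0.5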
+[rank7]:E0603 16:23:37.606000 74341 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Caught exception:
+[rank7]:E0603 16:23:37.606000 74341 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Traceback (most recent call last):
+[rank7]:E0603 16:23:37.606000 74341 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test
+[rank7]:E0603 16:23:37.606000 74341 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] getattr(self, test_name)()
+[rank7]:E0603 16:23:37.606000 74341 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper
+[rank7]:E0603 16:23:37.606000 74341 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] fn()
+[rank7]:E0603 16:23:37.606000 74341 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper
+[rank7]:E0603 16:23:37.606000 74341 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] method(*args, **kwargs)
+[rank7]:E0603 16:23:37.606000 74341 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 260, in test_learning_pipelining_with_interleaving
+[rank7]:E0603 16:23:37.606000 74341 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] self._forward_backward_test_impl(
+[rank7]:E0603 16:23:37.606000 74341 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl
+[rank7]:E0603 16:23:37.606000 74341 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] loss = fwd_bwd_func(
+[rank7]:E0603 16:23:37.606000 74341 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_with_interleaving.py", line 219, in _forward_backward_pipelining_with_interleaving
+[rank7]:E0603 16:23:37.606000 74341 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_communication.recv_forward(
+[rank7]:E0603 16:23:37.606000 74341 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 342, in recv_forward
+[rank7]:E0603 16:23:37.606000 74341 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor, _ = _communicate(
+[rank7]:E0603 16:23:37.606000 74341 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate
+[rank7]:E0603 16:23:37.606000 74341 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm)
+[rank7]:E0603 16:23:37.606000 74341 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops
+[rank7]:E0603 16:23:37.606000 74341 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] reqs = torch.distributed.batch_isend_irecv(ops)
+[rank7]:E0603 16:23:37.606000 74341 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv
+[rank7]:E0603 16:23:37.606000 74341 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_op.op(
+[rank7]:E0603 16:23:37.606000 74341 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2411, in irecv
+[rank7]:E0603 16:23:37.606000 74341 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] return group.recv([tensor], group_src, tag)
+[rank7]:E0603 16:23:37.606000 74341 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1
+[rank7]:E0603 16:23:37.606000 74341 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error.
+[rank7]:E0603 16:23:37.606000 74341 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Last error:
+[rank7]:E0603 16:23:37.606000 74341 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Error while creating shared memory segment /dev/shm/nccl-g8m58H (size 8257920), error: No space left on device (28)
+[rank7]:E0603 16:23:37.606000 74341 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741]
+[rank7]:E0603 16:23:37.606000 74341 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] To execute this test, run the following from the base repo dir:
+[rank7]:E0603 16:23:37.606000 74341 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_learning_pipelining_with_interleaving
+[rank7]:E0603 16:23:37.606000 74341 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741]
+[rank7]:E0603 16:23:37.606000 74341 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0
+[rank7]:E0603 16:23:37.606000 74341 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 7 with exit code: 10
+[dist init] rank = 7, world_size = 8
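The failure itself is environmental rather than something in this patch: each rank asks NCCL for an ~8 MB shared-memory segment under /dev/shm and the filesystem is full (errno 28). A hedged pre-flight check one could run before the test, with an arbitrary illustrative threshold (eight ranks times roughly 8 MB each):

    import glob
    import shutil

    usage = shutil.disk_usage("/dev/shm")
    print(f"/dev/shm free: {usage.free / 2**20:.1f} MiB")

    # Stale segments from earlier crashed runs also eat into this budget.
    leftovers = glob.glob("/dev/shm/nccl-*")
    if leftovers:
        print("leftover NCCL segments:", leftovers)

    if usage.free < 8 * 8 * 2**20:
        raise RuntimeError("not enough /dev/shm for 8 NCCL ranks; enlarge the container's shared memory")

The usual remedies are enlarging the container's shared memory (for example the --shm-size option when the tests run inside Docker) or clearing stale /dev/shm/nccl-* files; both are assumptions about this environment, not something the log itself states.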
    [... ranks 0 and 1 fail the same way through the send path (fwd_bwd_pipelining_with_interleaving.py, line 276, in _forward_backward_pipelining_with_interleaving -> p2p_communication.send_forward_recv_forward -> p2p_communication.py, line 515 -> _communicate -> _run_p2pops -> torch.distributed.batch_isend_irecv -> distributed_c10d.py, line 2366, in isend -> group.send([tensor], group_dst, tag)); ranks 2-6 fail through recv_forward exactly as rank 7 above; every rank reports the same torch.distributed.DistBackendError / ncclSystemError while creating its own /dev/shm/nccl-* segment (size 8257920, No space left on device (28)), prints the same repro command, and exits with code 10 ...]
+[dist init] rank = 1, world_size = 8
+[dist init] rank = 3, world_size = 8
+[dist init] rank = 5, world_size = 8
+[rank2]:[W603 16:23:37.958873085 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
+[rank0]:[W603 16:23:37.997044803 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
+[rank4]:[W603 16:23:37.998627199 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
+[rank6]:[W603 16:23:38.116992402 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
+[rank3]:[W603 16:23:38.516222369 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
+[rank5]:[W603 16:23:38.541465240 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
+[rank1]:[W603 16:23:38.748065229 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
+[rank7]:[W603 16:23:38.794640305 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources.
For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +ERROR +test_learning_pipelining_without_interleaving (test_pipeline_parallel_fwd_bwd.NcclPipelineParallelForwardBackwardTest) ... Process 3 terminated with exit code 10, terminating remaining processes. +[rank0]:[W603 16:24:01.300922058 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank3]:[W603 16:24:01.310179142 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank4]:[W603 16:24:01.313455880 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 4] using GPU 4 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank1]:[W603 16:24:02.201775634 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank6]:[W603 16:24:02.208507283 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 6] using GPU 6 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank2]:[W603 16:24:02.208962478 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank7]:[W603 16:24:02.209742284 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 7] using GPU 7 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank5]:[W603 16:24:02.219031365 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 5] using GPU 5 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. 
+/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/utils.py:81: UserWarning: This function is only for unittest
+ warnings.warn("This function is only for unittest")
+[dist init] rank = 4, world_size = 8
+ > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 1056
+[dist init] rank = 0, world_size = 8
+ > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 1056
+[dist init] rank = 2, world_size = 8
+ > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 1056
+[dist init] rank = 6, world_size = 8
+ > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 1056
+/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
+ with torch.cuda.amp.autocast(
+[... the same torch.cuda.amp.autocast FutureWarning is emitted once per rank, eight times in total ...]
+[... "> number of parameters on (tensor, pipeline) model parallel rank (0, k): 1056" is then reported again for k = 0..3 by the remaining ranks ...]
+/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:208: UserWarning: Converting a tensor with requires_grad=True to a scalar may lead to unexpected behavior.
+Consider using tensor.detach() first. (Triggered internally at /skishore/github/pytorch/aten/src/ATen/native/Scalar.cpp:22.)
+ self.assertEqual(x.item() / microbatch_size, target_loss.item())
+[... this UserWarning appears twice, followed by further "(0, k): 1056" parameter-count lines for k = 0..3 ...]
+/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead.
+ torch.cuda.amp.GradScaler(init_scale=4.0)
+[... the same torch.cuda.amp.GradScaler FutureWarning is emitted once per rank, eight times in total, interleaved with further "> number of parameters on (tensor, pipeline) model parallel rank (0, k): 1056" lines for k = 0..3 ...]
+[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Caught exception:
+[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Traceback (most recent call last):
+[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test
+[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] getattr(self, test_name)()
+[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper
+[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] fn()
+[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper
+[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] method(*args, **kwargs)
+[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 239, in test_learning_pipelining_without_interleaving
+[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] self._forward_backward_test_impl(
+[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl
+[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] loss = fwd_bwd_func(
+[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 332, in forward_backward_pipelining_without_interleaving
+[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor = recv_forward(
+[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 99, in recv_forward
+[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_communication.recv_forward(
+[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 342, in recv_forward
+[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor, _ = _communicate(
+[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate
+[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm)
+[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops
+[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] reqs = torch.distributed.batch_isend_irecv(ops)
+[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv
+[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_op.op(
+[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2411, in irecv
+[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] return group.recv([tensor], group_src, tag)
+[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1
+[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error.
+[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Last error:
+[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Error while creating shared memory segment /dev/shm/nccl-2fZGs1 (size 8257920), error: No space left on device (28)
+[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] To execute this test, run the following from the base repo dir:
+[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_learning_pipelining_without_interleaving
+[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0
+[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 5 with exit code: 10
+[dist init] rank = 5, world_size = 8
+[... rank 3 (pid 78482, 16:26:09.378000) fails with the identical traceback and DistBackendError; its shared memory segment is /dev/shm/nccl-GPa5Ic (size 8257920), No space left on device (28), and it exits process 3 with exit code: 10 ...]
+[... rank 1 (pid 78480, 16:26:09.378000) fails on the send side instead: fwd_bwd_pipelining_without_interleaving.py:349 -> send_forward (line 145) -> p2p_communication.send_forward (p2p_communication.py:401) -> _communicate (line 259) -> _run_p2pops (line 97) -> torch.distributed.batch_isend_irecv (distributed_c10d.py:2717) -> isend (distributed_c10d.py:2366) -> return group.send([tensor], group_dst, tag) ...]
+[rank1]:E0603 16:26:09.378000 78480 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1
+[rank1]:E0603 16:26:09.378000 78480 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error.
+[rank1]:E0603 16:26:09.378000 78480 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Last error:
+[rank1]:E0603 16:26:09.378000 78480 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Error while creating shared memory segment /dev/shm/nccl-abqPSD (size 8257920), error: No space left on device (28)
+[rank1]:E0603 16:26:09.378000 78480 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] To execute this test, run the following from the base repo dir:
+[rank1]:E0603 16:26:09.378000 78480 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_learning_pipelining_without_interleaving
+[rank1]:E0603 16:26:09.378000 78480 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0
+[rank1]:E0603 16:26:09.378000 78480 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 1 with exit code: 10
+[... rank 7 (pid 78486, 16:26:09.378000) fails with the same DistBackendError from fwd_bwd_pipelining_without_interleaving.py:367, where "input_tensor: List[Union[None, torch.Tensor, FutureTensor]] = recv_forward(tensor_shapes=recv_tensor_shapes, dtype=dtype, async_comm=async_comm)" reaches the same recv_forward -> _communicate -> _run_p2pops -> batch_isend_irecv -> irecv chain; its shared memory segment is /dev/shm/nccl-PAeuzA (size 8257920), No space left on device (28), and it exits process 7 with exit code: 10 ...]
+[dist init] rank = 1, world_size = 8
+[dist init] rank = 3, world_size = 8
+[dist init] rank = 7, world_size = 8
+[... rank 6 (pid 78485, 16:26:10.117000) fails with the identical line-367 traceback; its shared memory segment is /dev/shm/nccl-UuuwZX (size 8257920), No space left on device (28), and it exits process 6 with exit code: 10 ...]
+[... rank 2 (pid 78481, 16:26:10.118000) fails with the identical line-332 traceback recorded above for rank 5; its shared memory segment is /dev/shm/nccl-jvUOv9 (size 8257920), No space left on device (28), and it exits process 2 with exit code: 10 ...]
+[... rank 4 (pid 78483, 16:26:10.118000) then fails with the identical line-332 traceback ...]
+[rank4]:E0603 16:26:10.118000 78483 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1
+[rank4]:E0603 16:26:10.118000 78483 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error.
+[rank4]:E0603 16:26:10.118000 78483 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Last error: +[rank4]:E0603 16:26:10.118000 78483 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Error while creating shared memory segment /dev/shm/nccl-auyogM (size 8257920), error: No space left on device (28) +[rank4]:E0603 16:26:10.118000 78483 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] +[rank4]:E0603 16:26:10.118000 78483 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] To execute this test, run the following from the base repo dir: +[rank4]:E0603 16:26:10.118000 78483 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_learning_pipelining_without_interleaving +[rank4]:E0603 16:26:10.118000 78483 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] +[rank4]:E0603 16:26:10.118000 78483 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 +[rank4]:E0603 16:26:10.118000 78483 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 4 with exit code: 10 +[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Caught exception: +[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Traceback (most recent call last): +[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test +[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] getattr(self, test_name)() +[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper +[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] fn() +[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper +[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] method(*args, **kwargs) +[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 239, in test_learning_pipelining_without_interleaving +[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] self._forward_backward_test_impl( +[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl +[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] loss = fwd_bwd_func( +[rank0]:E0603 16:26:10.120000 78479 
/skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 349, in forward_backward_pipelining_without_interleaving +[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] send_forward( +[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 145, in send_forward +[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_communication.send_forward( +[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 401, in send_forward +[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] _communicate( +[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate +[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm) +[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops +[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] reqs = torch.distributed.batch_isend_irecv(ops) +[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv +[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_op.op( +[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2366, in isend +[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] return group.send([tensor], group_dst, tag) +[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1 +[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error. 
+[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Last error: +[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Error while creating shared memory segment /dev/shm/nccl-bYNB7m (size 8257920), error: No space left on device (28) +[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] +[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] To execute this test, run the following from the base repo dir: +[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_learning_pipelining_without_interleaving +[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] +[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 +[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 0 with exit code: 10 +[rank5]:[W603 16:26:10.298413670 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +[rank7]:[W603 16:26:10.307117426 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +[rank3]:[W603 16:26:10.349196783 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +[rank1]:[W603 16:26:10.354651742 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +[rank6]:[W603 16:26:10.420917412 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +[rank2]:[W603 16:26:10.432103319 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +[rank4]:[W603 16:26:10.444288648 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +[rank0]:[W603 16:26:10.468277920 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. 
For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +ERROR +test_learning_pipelining_without_interleaving_ucc_for_p2p (test_pipeline_parallel_fwd_bwd.NcclPipelineParallelForwardBackwardTest) ... Process 1 terminated with exit code 10, terminating remaining processes. +[rank3]:[W603 16:26:32.370430829 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank1]:[W603 16:26:32.374126929 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank0]:[W603 16:26:32.374865154 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank7]:[W603 16:26:33.111700547 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 7] using GPU 7 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank4]:[W603 16:26:33.222152363 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 4] using GPU 4 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank2]:[W603 16:26:33.230871453 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank5]:[W603 16:26:33.234719208 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 5] using GPU 5 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank6]:[W603 16:26:33.237089914 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 6] using GPU 6 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[dist init] rank = 7, world_size = 8 +[dist init] rank = 5, world_size = 8 +[dist init] rank = 1, world_size = 8 +[dist init] rank = 2, world_size = 8 +[dist init] rank = 6, world_size = 8 +[dist init] rank = 3, world_size = 8 +[dist init] rank = 4, world_size = 8 +[dist init] rank = 0, world_size = 8 +[rank0]:[W603 16:26:45.905335399 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. 
For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +skipped 'Test skipped at subprocess level, look at subprocess log for skip reason' +test_pipelining_without_interleaving_encoder_and_decoder (test_pipeline_parallel_fwd_bwd.NcclPipelineParallelWithToyParallelMLP) ... [rank1]:[W603 16:26:59.114654676 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank0]:[W603 16:26:59.138041697 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank2]:[W603 16:26:59.151866578 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank5]:[W603 16:26:59.153898992 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 5] using GPU 5 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank6]:[W603 16:26:59.155885838 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 6] using GPU 6 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank7]:[W603 16:26:59.156481882 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 7] using GPU 7 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank4]:[W603 16:26:59.159510874 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 4] using GPU 4 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank3]:[W603 16:26:59.164376950 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. 
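Note on the ERROR and the skipped test above: every rank aborts with the same ncclSystemError because NCCL cannot create its shared-memory segment under /dev/shm ("No space left on device (28)"), so this looks like an exhausted /dev/shm on the test host (often the small default shm size of a container) rather than a bug in the pipeline-parallel schedule itself. A minimal pre-flight check, assuming a POSIX host with /dev/shm mounted; the 64 MiB threshold is only an illustration:

    import shutil

    # /dev/shm backs NCCL's shared-memory transport; the failing segment in the
    # log above is 8257920 bytes.
    total, used, free = shutil.disk_usage("/dev/shm")
    print(f"/dev/shm free: {free / 2**20:.1f} MiB")
    if free < 64 * 2**20:
        raise RuntimeError("free /dev/shm is low; enlarge it (e.g. the container's --shm-size) "
                           "before re-running the NCCL pipeline-parallel tests")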
+/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/utils.py:81: UserWarning: This function is only for unittest + warnings.warn("This function is only for unittest") +[dist init] rank = 5, world_size = 8 + > number of parameters on (tensor, pipeline) model parallel rank (1, 2): 16576 +[dist init] rank = 3, world_size = 8 + > number of parameters on (tensor, pipeline) model parallel rank (1, 1): 16576 +[dist init] rank = 7, world_size = 8 + > number of parameters on (tensor, pipeline) model parallel rank (1, 3): 16576 +[dist init] rank = 4, world_size = 8 + > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 16576 +[dist init] rank = 6, world_size = 8 + > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 16576 +[dist init] rank = 0, world_size = 8 + > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 16576 +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +[dist init] rank = 2, world_size = 8 + > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 16576 +[dist init] rank = 1, world_size = 8 + > number of parameters on (tensor, pipeline) model parallel rank (1, 0): 16576 +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +ok +test_pipelining_without_interleaving_encoder_or_decoder (test_pipeline_parallel_fwd_bwd.NcclPipelineParallelWithToyParallelMLP) ... 
[rank6]:[W603 16:28:16.813641713 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 6] using GPU 6 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank7]:[W603 16:28:16.822877040 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 7] using GPU 7 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank3]:[W603 16:28:16.822896689 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank0]:[W603 16:28:16.826799597 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank2]:[W603 16:28:16.830679000 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank1]:[W603 16:28:16.831445467 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank5]:[W603 16:28:16.831964727 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 5] using GPU 5 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank4]:[W603 16:28:16.832049082 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 4] using GPU 4 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/utils.py:81: UserWarning: This function is only for unittest + warnings.warn("This function is only for unittest") +[dist init] rank = 7, world_size = 8 + > number of parameters on (tensor, pipeline) model parallel rank (1, 3): 16576 +[dist init] rank = 3, world_size = 8 + > number of parameters on (tensor, pipeline) model parallel rank (1, 1): 16576 +[dist init] rank = 0, world_size = 8 + > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 16576 +[dist init] rank = 4, world_size = 8 + > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 16576 +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. 
+ with torch.cuda.amp.autocast( +[dist init] rank = 1, world_size = 8 + > number of parameters on (tensor, pipeline) model parallel rank (1, 0): 16576 +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +[dist init] rank = 6, world_size = 8 + > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 16576 +[dist init] rank = 2, world_size = 8 + > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 16576 +[dist init] rank = 5, world_size = 8 + > number of parameters on (tensor, pipeline) model parallel rank (1, 2): 16576 +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +ok +test_pipelining_without_interleaving_inferenc_encoder_and_decoder (test_pipeline_parallel_fwd_bwd.NcclPipelineParallelWithToyParallelMLP) ... [rank5]:[W603 16:29:58.144385859 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 5] using GPU 5 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank0]:[W603 16:29:58.153746813 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank4]:[W603 16:29:58.154014020 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 4] using GPU 4 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. 
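The two ProcessGroupNCCL warnings repeated for every rank throughout this run each name their own remedy: pass device_id to init_process_group() so the rank-to-GPU mapping is known up front, and call destroy_process_group() before the process exits. A minimal sketch of that pattern; LOCAL_RANK and the "nccl" backend are assumptions for illustration, and the device_id argument needs a reasonably recent torch.distributed:

    import os
    import torch
    import torch.distributed as dist

    local_rank = int(os.environ.get("LOCAL_RANK", "0"))  # assumed launcher convention
    torch.cuda.set_device(local_rank)

    # Binding the group to a device addresses the "using GPU ... currently unknown" warning.
    dist.init_process_group("nccl", device_id=torch.device("cuda", local_rank))
    try:
        ...  # test or training body
    finally:
        # Addresses the "destroy_process_group() was not called before program exit" warning.
        dist.destroy_process_group()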
+[rank1]:[W603 16:29:58.166112505 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank7]:[W603 16:29:58.167303591 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 7] using GPU 7 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank3]:[W603 16:29:58.244365810 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank2]:[W603 16:29:58.249077668 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank6]:[W603 16:29:58.250782958 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 6] using GPU 6 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/utils.py:81: UserWarning: This function is only for unittest + warnings.warn("This function is only for unittest") +[dist init] rank = 6, world_size = 8 + > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 16576 +[dist init] rank = 7, world_size = 8 + > number of parameters on (tensor, pipeline) model parallel rank (1, 3): 16576 +[dist init] rank = 5, world_size = 8 + > number of parameters on (tensor, pipeline) model parallel rank (1, 2): 16576 +[dist init] rank = 1, world_size = 8 + > number of parameters on (tensor, pipeline) model parallel rank (1, 0): 16576 +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +[dist init] rank = 2, world_size = 8 + > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 16576 +[dist init] rank = 3, world_size = 8 + > number of parameters on (tensor, pipeline) model parallel rank (1, 1): 16576 +[dist init] rank = 0, world_size = 8 + > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 16576 +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +[dist init] rank = 4, world_size = 8 + > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 16576 +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. 
+ with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +ok +test_pipelining_without_interleaving_inference_sequence_paralle_encoder_and_decoder (test_pipeline_parallel_fwd_bwd.NcclPipelineParallelWithToyParallelMLP) ... [rank5]:[W603 16:30:36.818185801 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 5] using GPU 5 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank2]:[W603 16:30:36.820888440 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank4]:[W603 16:30:36.822907925 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 4] using GPU 4 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank1]:[W603 16:30:36.834983897 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank6]:[W603 16:30:36.911624183 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 6] using GPU 6 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank7]:[W603 16:30:36.919864365 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 7] using GPU 7 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. 
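The FutureWarning raised from apex/transformer/pipeline_parallel/schedules/common.py:295 on every rank already names its replacement; a short sketch of the migration, with the dtype chosen only for illustration:

    import torch

    # Deprecated spelling that triggers the warning above:
    #   with torch.cuda.amp.autocast(dtype=torch.half):
    #       ...
    # Replacement suggested by the warning:
    with torch.amp.autocast("cuda", dtype=torch.half):
        ...  # forward pass under mixed precision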
+[rank3]:[W603 16:30:37.098846840 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank0]:[W603 16:30:37.101206801 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/utils.py:81: UserWarning: This function is only for unittest + warnings.warn("This function is only for unittest") +[dist init] rank = 5, world_size = 8 + > number of parameters on (tensor, pipeline) model parallel rank (1, 2): 16576 +[dist init] rank = 6, world_size = 8 + > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 16576 +[dist init] rank = 4, world_size = 8 + > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 16576 +[dist init] rank = 0, world_size = 8 + > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 16576 +[dist init] rank = 7, world_size = 8 + > number of parameters on (tensor, pipeline) model parallel rank (1, 3): 16576 +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +[dist init] rank = 3, world_size = 8 + > number of parameters on (tensor, pipeline) model parallel rank (1, 1): 16576 +[dist init] rank = 1, world_size = 8 + > number of parameters on (tensor, pipeline) model parallel rank (1, 0): 16576 +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +[dist init] rank = 2, world_size = 8 + > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 16576 +/skishore/github/pytorch/torch/distributed/c10d_logger.py:81: FutureWarning: `torch.distributed._all_gather_base` is a private function and will be deprecated. Please use `torch.distributed.all_gather_into_tensor` instead. + return func(*args, **kwargs) +/skishore/github/pytorch/torch/distributed/c10d_logger.py:81: FutureWarning: `torch.distributed._all_gather_base` is a private function and will be deprecated. Please use `torch.distributed.all_gather_into_tensor` instead. + return func(*args, **kwargs) +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/mappings.py:125: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead. + torch.distributed._reduce_scatter_base( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/mappings.py:125: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead. 
+ torch.distributed._reduce_scatter_base( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/skishore/github/pytorch/torch/distributed/c10d_logger.py:81: FutureWarning: `torch.distributed._all_gather_base` is a private function and will be deprecated. Please use `torch.distributed.all_gather_into_tensor` instead. + return func(*args, **kwargs) +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/skishore/github/pytorch/torch/distributed/c10d_logger.py:81: FutureWarning: `torch.distributed._all_gather_base` is a private function and will be deprecated. Please use `torch.distributed.all_gather_into_tensor` instead. + return func(*args, **kwargs) +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/mappings.py:125: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead. + torch.distributed._reduce_scatter_base( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/mappings.py:125: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead. + torch.distributed._reduce_scatter_base( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/skishore/github/pytorch/torch/distributed/c10d_logger.py:81: FutureWarning: `torch.distributed._all_gather_base` is a private function and will be deprecated. Please use `torch.distributed.all_gather_into_tensor` instead. + return func(*args, **kwargs) +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/skishore/github/pytorch/torch/distributed/c10d_logger.py:81: FutureWarning: `torch.distributed._all_gather_base` is a private function and will be deprecated. Please use `torch.distributed.all_gather_into_tensor` instead. + return func(*args, **kwargs) +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/mappings.py:125: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead. + torch.distributed._reduce_scatter_base( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/mappings.py:125: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead. 
+ torch.distributed._reduce_scatter_base( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/skishore/github/pytorch/torch/distributed/c10d_logger.py:81: FutureWarning: `torch.distributed._all_gather_base` is a private function and will be deprecated. Please use `torch.distributed.all_gather_into_tensor` instead. + return func(*args, **kwargs) +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/skishore/github/pytorch/torch/distributed/c10d_logger.py:81: FutureWarning: `torch.distributed._all_gather_base` is a private function and will be deprecated. Please use `torch.distributed.all_gather_into_tensor` instead. + return func(*args, **kwargs) +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/mappings.py:125: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead. + torch.distributed._reduce_scatter_base( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/mappings.py:125: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead. + torch.distributed._reduce_scatter_base( +ok +test_pipelining_without_interleaving_sequence_paralle_encoder_and_decoder (test_pipeline_parallel_fwd_bwd.NcclPipelineParallelWithToyParallelMLP) ... [rank7]:[W603 16:31:14.282970134 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 7] using GPU 7 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank2]:[W603 16:31:14.397246689 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank4]:[W603 16:31:14.401349996 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 4] using GPU 4 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank1]:[W603 16:31:14.402028883 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank5]:[W603 16:31:14.403024530 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 5] using GPU 5 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. 
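Likewise, the c10d FutureWarnings above point from the private collectives apex still calls to their public counterparts: torch.distributed.all_gather_into_tensor for _all_gather_base and torch.distributed.reduce_scatter_tensor for _reduce_scatter_base. A minimal sketch of the public API, assuming an already-initialized NCCL process group; the shapes are illustrative only:

    import torch
    import torch.distributed as dist

    world_size = dist.get_world_size()
    shard = torch.ones(4, device="cuda")

    # all_gather_into_tensor replaces _all_gather_base: the output holds every rank's shard.
    gathered = torch.empty(world_size * shard.numel(), device="cuda")
    dist.all_gather_into_tensor(gathered, shard)

    # reduce_scatter_tensor replaces _reduce_scatter_base: each rank keeps one reduced shard.
    reduced = torch.empty_like(shard)
    dist.reduce_scatter_tensor(reduced, gathered)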
+[rank3]:[W603 16:31:14.405795309 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device.
+/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/utils.py:81: UserWarning: This function is only for unittest
+ warnings.warn("This function is only for unittest")
+[dist init] rank = 6, world_size = 8
+ > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 16576
+[dist init] rank = 4, world_size = 8
+ > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 16576
+[dist init] rank = 7, world_size = 8
+ > number of parameters on (tensor, pipeline) model parallel rank (1, 3): 16576
+[dist init] rank = 0, world_size = 8
+ > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 16576
+/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
+ with torch.cuda.amp.autocast(
+[dist init] rank = 2, world_size = 8
+ > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 16576
+[dist init] rank = 1, world_size = 8
+ > number of parameters on (tensor, pipeline) model parallel rank (1, 0): 16576
+[dist init] rank = 3, world_size = 8
+ > number of parameters on (tensor, pipeline) model parallel rank (1, 1): 16576
+[dist init] rank = 5, world_size = 8
+ > number of parameters on (tensor, pipeline) model parallel rank (1, 2): 16576
+/skishore/github/pytorch/torch/distributed/c10d_logger.py:81: FutureWarning: `torch.distributed._all_gather_base` is a private function and will be deprecated. Please use `torch.distributed.all_gather_into_tensor` instead.
+ return func(*args, **kwargs)
+/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/mappings.py:125: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead.
+ torch.distributed._reduce_scatter_base(
+/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/layers.py:358: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead.
+ handle = torch.distributed._reduce_scatter_base(
+ok
+test_pipelining_without_interleaving_sequence_parallel_encoder_or_decoder (test_pipeline_parallel_fwd_bwd.NcclPipelineParallelWithToyParallelMLP) ... [rank0]:[W603 16:32:27.723335566 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device.
+/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/utils.py:81: UserWarning: This function is only for unittest
+ warnings.warn("This function is only for unittest")
+[dist init] rank = 1, world_size = 8
+ > number of parameters on (tensor, pipeline) model parallel rank (1, 0): 16576
+/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
+ with torch.cuda.amp.autocast(
+[dist init] rank = 7, world_size = 8
+ > number of parameters on (tensor, pipeline) model parallel rank (1, 3): 16576
+[dist init] rank = 5, world_size = 8
+ > number of parameters on (tensor, pipeline) model parallel rank (1, 2): 16576
+[dist init] rank = 2, world_size = 8
+ > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 16576
+[dist init] rank = 4, world_size = 8
+ > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 16576
+[dist init] rank = 6, world_size = 8
+ > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 16576
+[dist init] rank = 0, world_size = 8
+ > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 16576
+[dist init] rank = 3, world_size = 8
+ > number of parameters on (tensor, pipeline) model parallel rank (1, 1): 16576
+/skishore/github/pytorch/torch/distributed/c10d_logger.py:81: FutureWarning: `torch.distributed._all_gather_base` is a private function and will be deprecated. Please use `torch.distributed.all_gather_into_tensor` instead.
+ return func(*args, **kwargs)
+/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/mappings.py:125: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead.
+ torch.distributed._reduce_scatter_base(
+/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/layers.py:358: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead.
+ handle = torch.distributed._reduce_scatter_base(
+ok
+test_pipelining_without_interleaving_sequence_parallel_encoder_or_decoder_half (test_pipeline_parallel_fwd_bwd.NcclPipelineParallelWithToyParallelMLP) ... [rank2]:[W603 16:33:41.746748332 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device.
+/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/utils.py:81: UserWarning: This function is only for unittest
+ warnings.warn("This function is only for unittest")
+[dist init] rank = 6, world_size = 8
+ > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 16576
+[dist init] rank = 4, world_size = 8
+ > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 16576
+[dist init] rank = 0, world_size = 8
+ > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 16576
+[dist init] rank = 7, world_size = 8
+ > number of parameters on (tensor, pipeline) model parallel rank (1, 3): 16576
+[dist init] rank = 2, world_size = 8
+ > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 16576
+[dist init] rank = 3, world_size = 8
+ > number of parameters on (tensor, pipeline) model parallel rank (1, 1): 16576
+[dist init] rank = 5, world_size = 8
+ > number of parameters on (tensor, pipeline) model parallel rank (1, 2): 16576
+[dist init] rank = 1, world_size = 8
+ > number of parameters on (tensor, pipeline) model parallel rank (1, 0): 16576
+/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
+ with torch.cuda.amp.autocast(
+/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/_autocast_utils.py:26: FutureWarning: `torch.cuda.amp.autocast_mode._cast(value, dtype)` is deprecated. Please use `torch.amp.autocast_mode._cast(value, 'cuda', dtype)` instead.
+ return torch.cuda.amp.autocast_mode._cast(args, torch.get_autocast_gpu_dtype())
+/skishore/github/pytorch/torch/distributed/c10d_logger.py:81: FutureWarning: `torch.distributed._all_gather_base` is a private function and will be deprecated. Please use `torch.distributed.all_gather_into_tensor` instead.
+ return func(*args, **kwargs)
+/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/mappings.py:125: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead.
+ torch.distributed._reduce_scatter_base(
+/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/layers.py:358: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead.
+ handle = torch.distributed._reduce_scatter_base(
+ok
+test_inference_async_pipelining_with_interleaving (test_pipeline_parallel_fwd_bwd.UccPipelineParallelForwardBackwardTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)'
+test_inference_async_pipelining_without_interleaving (test_pipeline_parallel_fwd_bwd.UccPipelineParallelForwardBackwardTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)'
+test_inference_no_pipelining (test_pipeline_parallel_fwd_bwd.UccPipelineParallelForwardBackwardTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)'
+test_inference_pipelining_with_interleaving (test_pipeline_parallel_fwd_bwd.UccPipelineParallelForwardBackwardTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)'
+test_inference_pipelining_without_interleaving (test_pipeline_parallel_fwd_bwd.UccPipelineParallelForwardBackwardTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)'
+test_learning_async_pipelining_with_interleaving (test_pipeline_parallel_fwd_bwd.UccPipelineParallelForwardBackwardTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)'
+test_learning_async_pipelining_without_interleaving (test_pipeline_parallel_fwd_bwd.UccPipelineParallelForwardBackwardTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)'
+test_learning_no_pipelining (test_pipeline_parallel_fwd_bwd.UccPipelineParallelForwardBackwardTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)'
+test_learning_pipelining_with_interleaving (test_pipeline_parallel_fwd_bwd.UccPipelineParallelForwardBackwardTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)'
+test_learning_pipelining_without_interleaving (test_pipeline_parallel_fwd_bwd.UccPipelineParallelForwardBackwardTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)'
+test_cuda_rng_tracker (test_random.NcclTransformerRandomTest) ... [rank0]:[W603 16:34:39.248877851 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device.
+/skishore/github/apex/tests/L0/run_transformer/test_random.py:73: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at /skishore/github/pytorch/torch/csrc/tensor/python_tensor.cpp:78.)
+ tensor = torch.cuda.FloatTensor(size)
+[dist init] rank = 0, world_size = 4
+[dist init] rank = 1, world_size = 4
+[dist init] rank = 3, world_size = 4
+[dist init] rank = 2, world_size = 4
+ok
+test_set_cuda_rng_state (test_random.NcclTransformerRandomTest) ... [rank1]:[W603 16:34:54.482892089 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device.
+/skishore/github/apex/tests/L0/run_transformer/test_random.py:30: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at /skishore/github/pytorch/torch/csrc/tensor/python_tensor.cpp:78.)
+ tensor = torch.cuda.FloatTensor(size)
+[dist init] rank = 0, world_size = 4
+[dist init] rank = 3, world_size = 4
+[dist init] rank = 1, world_size = 4
+[dist init] rank = 2, world_size = 4
+ok
+test_cuda_rng_tracker (test_random.UccTransformerRandomTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)'
+test_set_cuda_rng_state (test_random.UccTransformerRandomTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)'
+test_transformer (test_transformer_module.TestTransformer) ...
+
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+*****************************************
+/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
+ with torch.cuda.amp.autocast(
+/skishore/github/pytorch/torch/autograd/graph.py:824: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at /skishore/github/pytorch/torch/csrc/autograd/autograd_not_implemented_fallback.cpp:62.)
+ return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
+[rank7]:[W603 16:37:20.454816579 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 7] using GPU 7 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device.
+[rank0]:[W603 16:37:35.197418010 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
+
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+*****************************************
+/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/data.py:50: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at /skishore/github/pytorch/torch/csrc/tensor/python_tensor.cpp:78.)
+ sizes_cuda = torch.cuda.LongTensor(sizes)
(Triggered internally at /skishore/github/pytorch/torch/csrc/tensor/python_tensor.cpp:78.) + sizes_cuda = torch.cuda.LongTensor(sizes) +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/data.py:50: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at /skishore/github/pytorch/torch/csrc/tensor/python_tensor.cpp:78.) + sizes_cuda = torch.cuda.LongTensor(sizes) +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/data.py:50: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at /skishore/github/pytorch/torch/csrc/tensor/python_tensor.cpp:78.) + sizes_cuda = torch.cuda.LongTensor(sizes) +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/skishore/github/pytorch/torch/autograd/graph.py:824: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. 
DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at /skishore/github/pytorch/torch/csrc/autograd/autograd_not_implemented_fallback.cpp:62.) + return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass +/skishore/github/pytorch/torch/autograd/graph.py:824: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at /skishore/github/pytorch/torch/csrc/autograd/autograd_not_implemented_fallback.cpp:62.) + return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass +/skishore/github/pytorch/torch/autograd/graph.py:824: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at /skishore/github/pytorch/torch/csrc/autograd/autograd_not_implemented_fallback.cpp:62.) + return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass +/skishore/github/pytorch/torch/autograd/graph.py:824: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at /skishore/github/pytorch/torch/csrc/autograd/autograd_not_implemented_fallback.cpp:62.) + return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass +[rank7]:[W603 16:43:01.663759187 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 7] using GPU 7 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. 
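The UserWarnings above point at the two deprecated call sites in apex (tensor_parallel/data.py:50 and pipeline_parallel/schedules/common.py:295). The replacements suggested by the warnings themselves would look roughly like the sketch below; it is illustrative only and not part of this patch series, and the `sizes` values are placeholders.

    import torch

    sizes = [2, 512, 512]  # placeholder values; the real `sizes` list is built in data.py

    # instead of the deprecated: sizes_cuda = torch.cuda.LongTensor(sizes)
    sizes_cuda = torch.tensor(sizes, dtype=torch.long, device="cuda")

    # instead of the deprecated: with torch.cuda.amp.autocast(...):
    with torch.amp.autocast("cuda"):
        out = sizes_cuda.float() * 2.0  # any autocast-eligible work goes here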
+[rank6]:[W603 16:43:01.663999103 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 6] using GPU 6 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank5]:[W603 16:43:01.760702253 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 5] using GPU 5 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank4]:[W603 16:43:01.760939295 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 4] using GPU 4 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank3]:[W603 16:43:01.779388598 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank2]:[W603 16:43:01.780375512 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank1]:[W603 16:43:01.798539673 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank0]:[W603 16:43:01.798610107 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank2]:[W603 16:43:09.017878787 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +[rank6]:[W603 16:43:10.036188324 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +[rank4]:[W603 16:43:10.052884716 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +[rank7]:[W603 16:43:10.062708111 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +[rank1]:[W603 16:43:10.063044120 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. 
For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +[rank3]:[W603 16:43:10.064505398 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +[rank5]:[W603 16:43:10.066178591 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +[rank0]:[W603 16:43:10.071842795 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) + +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +[rank3]:[W603 16:43:35.670623763 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank7]:[W603 16:43:35.670765383 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 7] using GPU 7 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank1]:[W603 16:43:35.672929394 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank5]:[W603 16:43:35.676482316 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 5] using GPU 5 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/utils.py:81: UserWarning: This function is only for unittest + warnings.warn("This function is only for unittest") +[rank0]:[W603 16:43:35.691670551 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank6]:[W603 16:43:35.699256296 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 6] using GPU 6 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank2]:[W603 16:43:36.063533624 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. 
You can specify device_id in init_process_group() to force use of a particular device. +[rank4]:[W603 16:43:36.073806186 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 4] using GPU 4 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast( +/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/utils.py:81: UserWarning: This function is only for unittest + warnings.warn("This function is only for unittest") +[rank0]:[W603 16:46:11.098930997 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +[rank4]:[W603 16:46:11.115717975 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +[rank2]:[W603 16:46:11.119934281 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. 
For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +[rank6]:[W603 16:46:11.123500132 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +[rank1]:[W603 16:46:11.125988935 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +[rank5]:[W603 16:46:11.131786406 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +[rank3]:[W603 16:46:11.133387652 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +[rank7]:[W603 16:46:11.135553105 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +ok +test_split_tensor_along_last_dim (test_transformer_utils.TransformerUtilsTest) ... ####################################################### +# Python executable path: /opt/conda/envs/py_3.10/bin/python +# 3 tests: ['/skishore/github/apex/tests/L0/run_transformer/run_gpt_minimal_test.py', '/skishore/github/apex/tests/L0/run_transformer/run_bert_minimal_test.py', '/skishore/github/apex/tests/L0/run_transformer/run_dynamic_batchsize_test.py'] +####################################################### +### 1 / 3: cmd: /opt/conda/envs/py_3.10/bin/python -m torch.distributed.run --nproc_per_node=8 /skishore/github/apex/tests/L0/run_transformer/run_gpt_minimal_test.py --micro-batch-size 2 --num-layers 16 --hidden-size 256 --num-attention-heads 8 --max-position-embeddings 512 --seq-length 512 --global-batch-size 128 --pipeline-model-parallel-size 4 --tensor-model-parallel-size 2 +### 2 / 3: cmd: /opt/conda/envs/py_3.10/bin/python -m torch.distributed.run --nproc_per_node=8 /skishore/github/apex/tests/L0/run_transformer/run_bert_minimal_test.py --micro-batch-size 2 --num-layers 16 --hidden-size 256 --num-attention-heads 8 --max-position-embeddings 512 --seq-length 512 --global-batch-size 128 --pipeline-model-parallel-size 4 --tensor-model-parallel-size 2 --bert-no-binary-head +### 3 / 3: cmd: /opt/conda/envs/py_3.10/bin/python -m torch.distributed.run --nproc_per_node=8 /skishore/github/apex/tests/L0/run_transformer/run_dynamic_batchsize_test.py --micro-batch-size 2 --num-layers 16 --hidden-size 256 --num-attention-heads 8 --max-position-embeddings 512 --seq-length 512 --global-batch-size 128 --use-cpu-initialization +### PASSED +[rank3]:[W603 16:46:31.117388689 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. 
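Every one of the eight test errors reported below fails the same way: NCCL cannot create its roughly 8 MB shared-memory segment (8257920 bytes) in /dev/shm and aborts with "No space left on device". A hypothetical pre-flight check along these lines (not part of apex or of these patches; `check_dev_shm` is an invented helper) would surface the root cause before the distributed tests are launched; on containerized runners the usual remedy is a larger --shm-size.

    import shutil

    def check_dev_shm(min_free_mb: int = 256) -> None:
        """Fail fast if /dev/shm is nearly full before starting NCCL multi-process tests."""
        total, used, free = shutil.disk_usage("/dev/shm")
        free_mb = free // (1024 * 1024)
        if free_mb < min_free_mb:
            raise RuntimeError(
                f"/dev/shm has only {free_mb} MiB free; NCCL p2p tests are likely to fail "
                "with 'No space left on device'. Free up /dev/shm or raise the container shm size."
            )

    check_dev_shm()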
+[rank1]:[W603 16:46:31.130568896 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank0]:[W603 16:46:31.163744050 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[rank2]:[W603 16:46:31.206279493 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. +[dist init] rank = 3, world_size = 4 +[dist init] rank = 0, world_size = 4 +[dist init] rank = 1, world_size = 4 +[dist init] rank = 2, world_size = 4 +ok + +====================================================================== +ERROR: test_inference_async_pipelining_with_interleaving (test_pipeline_parallel_fwd_bwd.NcclPipelineParallelForwardBackwardTest) +---------------------------------------------------------------------- +Traceback (most recent call last): + File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 605, in wrapper + self._join_processes(fn) + File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 845, in _join_processes + self._check_return_codes(elapsed_time) + File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 894, in _check_return_codes + raise RuntimeError(error) +RuntimeError: Process 3 exited with error code 10 and exception: +Traceback (most recent call last): + File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test + getattr(self, test_name)() + File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper + fn() + File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper + method(*args, **kwargs) + File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 278, in test_inference_async_pipelining_with_interleaving + self._forward_backward_test_impl( + File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl + loss = fwd_bwd_func( + File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_with_interleaving.py", line 219, in _forward_backward_pipelining_with_interleaving + p2p_communication.recv_forward( + File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 342, in recv_forward + input_tensor, _ = _communicate( + File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate + tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm) + File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in 
_run_p2pops + reqs = torch.distributed.batch_isend_irecv(ops) + File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv + p2p_op.op( + File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2411, in irecv + return group.recv([tensor], group_src, tag) +torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1 +ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error. +Last error: +Error while creating shared memory segment /dev/shm/nccl-BZatXC (size 8257920), error: No space left on device (28) + +To execute this test, run the following from the base repo dir: + PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_inference_async_pipelining_with_interleaving + +This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 + + + +====================================================================== +ERROR: test_inference_async_pipelining_without_interleaving (test_pipeline_parallel_fwd_bwd.NcclPipelineParallelForwardBackwardTest) +---------------------------------------------------------------------- +Traceback (most recent call last): + File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 605, in wrapper + self._join_processes(fn) + File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 845, in _join_processes + self._check_return_codes(elapsed_time) + File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 894, in _check_return_codes + raise RuntimeError(error) +RuntimeError: Process 1 exited with error code 10 and exception: +Traceback (most recent call last): + File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test + getattr(self, test_name)() + File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper + fn() + File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper + method(*args, **kwargs) + File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 254, in test_inference_async_pipelining_without_interleaving + self._forward_backward_test_impl( + File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl + loss = fwd_bwd_func( + File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 349, in forward_backward_pipelining_without_interleaving + send_forward( + File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 145, in send_forward + p2p_communication.send_forward( + File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 401, in send_forward + _communicate( + File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate + tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, 
async_comm=async_comm) + File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops + reqs = torch.distributed.batch_isend_irecv(ops) + File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv + p2p_op.op( + File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2366, in isend + return group.send([tensor], group_dst, tag) +torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1 +ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error. +Last error: +Error while creating shared memory segment /dev/shm/nccl-G7NK1l (size 8257920), error: No space left on device (28) + +To execute this test, run the following from the base repo dir: + PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_inference_async_pipelining_without_interleaving + +This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 + + + +====================================================================== +ERROR: test_inference_pipelining_with_interleaving (test_pipeline_parallel_fwd_bwd.NcclPipelineParallelForwardBackwardTest) +---------------------------------------------------------------------- +Traceback (most recent call last): + File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 605, in wrapper + self._join_processes(fn) + File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 845, in _join_processes + self._check_return_codes(elapsed_time) + File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 894, in _check_return_codes + raise RuntimeError(error) +RuntimeError: Process 7 exited with error code 10 and exception: +Traceback (most recent call last): + File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test + getattr(self, test_name)() + File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper + fn() + File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper + method(*args, **kwargs) + File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 266, in test_inference_pipelining_with_interleaving + self._forward_backward_test_impl( + File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl + loss = fwd_bwd_func( + File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_with_interleaving.py", line 219, in _forward_backward_pipelining_with_interleaving + p2p_communication.recv_forward( + File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 342, in recv_forward + input_tensor, _ = _communicate( + File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate + tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm) + File 
"/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops + reqs = torch.distributed.batch_isend_irecv(ops) + File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv + p2p_op.op( + File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2411, in irecv + return group.recv([tensor], group_src, tag) +torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1 +ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error. +Last error: +Error while creating shared memory segment /dev/shm/nccl-lidQW9 (size 8257920), error: No space left on device (28) + +To execute this test, run the following from the base repo dir: + PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_inference_pipelining_with_interleaving + +This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 + + + +====================================================================== +ERROR: test_inference_pipelining_without_interleaving (test_pipeline_parallel_fwd_bwd.NcclPipelineParallelForwardBackwardTest) +---------------------------------------------------------------------- +Traceback (most recent call last): + File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 605, in wrapper + self._join_processes(fn) + File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 845, in _join_processes + self._check_return_codes(elapsed_time) + File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 894, in _check_return_codes + raise RuntimeError(error) +RuntimeError: Process 1 exited with error code 10 and exception: +Traceback (most recent call last): + File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test + getattr(self, test_name)() + File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper + fn() + File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper + method(*args, **kwargs) + File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 244, in test_inference_pipelining_without_interleaving + self._forward_backward_test_impl( + File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl + loss = fwd_bwd_func( + File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 349, in forward_backward_pipelining_without_interleaving + send_forward( + File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 145, in send_forward + p2p_communication.send_forward( + File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 401, in send_forward + _communicate( + File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate + tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, 
tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm) + File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops + reqs = torch.distributed.batch_isend_irecv(ops) + File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv + p2p_op.op( + File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2366, in isend + return group.send([tensor], group_dst, tag) +torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1 +ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error. +Last error: +Error while creating shared memory segment /dev/shm/nccl-G3jTgB (size 8257920), error: No space left on device (28) + +To execute this test, run the following from the base repo dir: + PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_inference_pipelining_without_interleaving + +This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 + + + +====================================================================== +ERROR: test_learning_async_pipelining_with_interleaving (test_pipeline_parallel_fwd_bwd.NcclPipelineParallelForwardBackwardTest) +---------------------------------------------------------------------- +Traceback (most recent call last): + File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 605, in wrapper + self._join_processes(fn) + File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 845, in _join_processes + self._check_return_codes(elapsed_time) + File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 894, in _check_return_codes + raise RuntimeError(error) +RuntimeError: Process 6 exited with error code 10 and exception: +Traceback (most recent call last): + File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test + getattr(self, test_name)() + File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper + fn() + File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper + method(*args, **kwargs) + File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 272, in test_learning_async_pipelining_with_interleaving + self._forward_backward_test_impl( + File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl + loss = fwd_bwd_func( + File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_with_interleaving.py", line 219, in _forward_backward_pipelining_with_interleaving + p2p_communication.recv_forward( + File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 342, in recv_forward + input_tensor, _ = _communicate( + File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate + tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, 
tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm) + File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops + reqs = torch.distributed.batch_isend_irecv(ops) + File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv + p2p_op.op( + File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2411, in irecv + return group.recv([tensor], group_src, tag) +torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1 +ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error. +Last error: +Error while creating shared memory segment /dev/shm/nccl-kcfrRq (size 8257920), error: No space left on device (28) + +To execute this test, run the following from the base repo dir: + PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_learning_async_pipelining_with_interleaving + +This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 + + + +====================================================================== +ERROR: test_learning_async_pipelining_without_interleaving (test_pipeline_parallel_fwd_bwd.NcclPipelineParallelForwardBackwardTest) +---------------------------------------------------------------------- +Traceback (most recent call last): + File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 605, in wrapper + self._join_processes(fn) + File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 845, in _join_processes + self._check_return_codes(elapsed_time) + File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 894, in _check_return_codes + raise RuntimeError(error) +RuntimeError: Process 1 exited with error code 10 and exception: +Traceback (most recent call last): + File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test + getattr(self, test_name)() + File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper + fn() + File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper + method(*args, **kwargs) + File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 249, in test_learning_async_pipelining_without_interleaving + self._forward_backward_test_impl( + File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl + loss = fwd_bwd_func( + File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 349, in forward_backward_pipelining_without_interleaving + send_forward( + File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 145, in send_forward + p2p_communication.send_forward( + File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 401, in send_forward + _communicate( + File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in 
_communicate + tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm) + File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops + reqs = torch.distributed.batch_isend_irecv(ops) + File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv + p2p_op.op( + File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2366, in isend + return group.send([tensor], group_dst, tag) +torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1 +ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error. +Last error: +Error while creating shared memory segment /dev/shm/nccl-OSTKDz (size 8257920), error: No space left on device (28) + +To execute this test, run the following from the base repo dir: + PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_learning_async_pipelining_without_interleaving + +This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 + + + +====================================================================== +ERROR: test_learning_pipelining_with_interleaving (test_pipeline_parallel_fwd_bwd.NcclPipelineParallelForwardBackwardTest) +---------------------------------------------------------------------- +Traceback (most recent call last): + File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 605, in wrapper + self._join_processes(fn) + File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 845, in _join_processes + self._check_return_codes(elapsed_time) + File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 894, in _check_return_codes + raise RuntimeError(error) +RuntimeError: Process 3 exited with error code 10 and exception: +Traceback (most recent call last): + File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test + getattr(self, test_name)() + File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper + fn() + File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper + method(*args, **kwargs) + File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 260, in test_learning_pipelining_with_interleaving + self._forward_backward_test_impl( + File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl + loss = fwd_bwd_func( + File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_with_interleaving.py", line 219, in _forward_backward_pipelining_with_interleaving + p2p_communication.recv_forward( + File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 342, in recv_forward + input_tensor, _ = _communicate( + File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate + tensor_send_prev_req, tensor_recv_prev_req, 
tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm) + File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops + reqs = torch.distributed.batch_isend_irecv(ops) + File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv + p2p_op.op( + File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2411, in irecv + return group.recv([tensor], group_src, tag) +torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1 +ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error. +Last error: +Error while creating shared memory segment /dev/shm/nccl-UrziMF (size 8257920), error: No space left on device (28) + +To execute this test, run the following from the base repo dir: + PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_learning_pipelining_with_interleaving + +This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 + + + +====================================================================== +ERROR: test_learning_pipelining_without_interleaving (test_pipeline_parallel_fwd_bwd.NcclPipelineParallelForwardBackwardTest) +---------------------------------------------------------------------- +Traceback (most recent call last): + File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 605, in wrapper + self._join_processes(fn) + File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 845, in _join_processes + self._check_return_codes(elapsed_time) + File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 894, in _check_return_codes + raise RuntimeError(error) +RuntimeError: Process 1 exited with error code 10 and exception: +Traceback (most recent call last): + File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test + getattr(self, test_name)() + File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper + fn() + File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper + method(*args, **kwargs) + File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 239, in test_learning_pipelining_without_interleaving + self._forward_backward_test_impl( + File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl + loss = fwd_bwd_func( + File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 349, in forward_backward_pipelining_without_interleaving + send_forward( + File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 145, in send_forward + p2p_communication.send_forward( + File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 401, in send_forward + _communicate( + File 
"/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate + tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm) + File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops + reqs = torch.distributed.batch_isend_irecv(ops) + File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv + p2p_op.op( + File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2366, in isend + return group.send([tensor], group_dst, tag) +torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1 +ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error. +Last error: +Error while creating shared memory segment /dev/shm/nccl-abqPSD (size 8257920), error: No space left on device (28) + +To execute this test, run the following from the base repo dir: + PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_learning_pipelining_without_interleaving + +This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 + + + +---------------------------------------------------------------------- +Ran 102 tests in 2937.884s + +FAILED (errors=8, skipped=44) diff --git a/tests/distributed/synced_batchnorm/single_gpu_unit_test.py b/tests/distributed/synced_batchnorm/single_gpu_unit_test.py index 446b6b0b7..93f187fb1 100644 --- a/tests/distributed/synced_batchnorm/single_gpu_unit_test.py +++ b/tests/distributed/synced_batchnorm/single_gpu_unit_test.py @@ -3,7 +3,8 @@ import apex if True: print("using setup tools") - import syncbn + from apex.op_builder import SyncBnBuilder + syncbn = SyncBnBuilder().load() else: print("using jit") from torch.utils.cpp_extension import load @@ -30,7 +31,7 @@ def compare(desc, inp1, inp2, error): error = 1e-5 np.random.seed(1) -dtype = np.float32 +dtype = np.float64 inp = (np.random.randn(batch_size, feature_size, space_size, space_size)).astype(dtype) grad = (np.random.randn(batch_size, feature_size, space_size, space_size)).astype(dtype) weight = (np.random.randn(feature_size)).astype(dtype) diff --git a/tests/distributed/synced_batchnorm/test_groups.py b/tests/distributed/synced_batchnorm/test_groups.py index 674f8e60a..74b2a9b13 100644 --- a/tests/distributed/synced_batchnorm/test_groups.py +++ b/tests/distributed/synced_batchnorm/test_groups.py @@ -1,7 +1,8 @@ import torch import numpy as np import apex -import syncbn +from apex.op_builder import SyncBnBuilder +syncbn = SyncBnBuilder().load() import os import argparse import torch.optim as optim diff --git a/tests/distributed/synced_batchnorm/two_gpu_unit_test.py b/tests/distributed/synced_batchnorm/two_gpu_unit_test.py index 5daeef48a..794da411f 100644 --- a/tests/distributed/synced_batchnorm/two_gpu_unit_test.py +++ b/tests/distributed/synced_batchnorm/two_gpu_unit_test.py @@ -3,7 +3,8 @@ import torch import numpy as np import apex -import syncbn +from apex.op_builder import SyncBnBuilder +syncbn = SyncBnBuilder().load() import os import argparse import torch.optim as optim From d6ad39837b1ec35f6839df6b943a0f7394f59410 Mon Sep 
17 00:00:00 2001 From: skishore Date: Thu, 3 Jul 2025 07:54:01 +0000 Subject: [PATCH 08/79] fix the unnecessary compile of syncbn module in wheel building due to imports in python module --- apex/parallel/__init__.py | 2 -- apex/parallel/optimized_sync_batchnorm_kernel.py | 7 +++++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/apex/parallel/__init__.py b/apex/parallel/__init__.py index f2368f8d8..a477c12a7 100644 --- a/apex/parallel/__init__.py +++ b/apex/parallel/__init__.py @@ -12,8 +12,6 @@ # for both the cuda-enabled and python-fallback versions, and I don't want # to suppress the error information. try: - from apex.op_builder import SyncBnBuilder - syncbn = SyncBnBuilder().load() from .optimized_sync_batchnorm import SyncBatchNorm except ImportError as err: from .sync_batchnorm import SyncBatchNorm diff --git a/apex/parallel/optimized_sync_batchnorm_kernel.py b/apex/parallel/optimized_sync_batchnorm_kernel.py index 93852e099..ad0e3ba30 100644 --- a/apex/parallel/optimized_sync_batchnorm_kernel.py +++ b/apex/parallel/optimized_sync_batchnorm_kernel.py @@ -4,12 +4,13 @@ from apex.op_builder import SyncBnBuilder from apex.parallel import ReduceOp -syncbn = SyncBnBuilder().load() - class SyncBatchnormFunction(Function): @staticmethod def forward(ctx, input, z, weight, bias, running_mean, running_variance, eps, track_running_stats = True, momentum = 1.0, process_group = None, channel_last = False, fuse_relu = False): + + syncbn = SyncBnBuilder().load() + input = input.contiguous() world_size = 0 @@ -75,6 +76,8 @@ def forward(ctx, input, z, weight, bias, running_mean, running_variance, eps, tr @staticmethod def backward(ctx, grad_output): + syncbn = SyncBnBuilder().load() + grad_output = grad_output.contiguous() # mini batch mean & var are calculated by forward path. 
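# The hunks above defer SyncBnBuilder().load() from import time into forward() and
# backward(), so that merely importing apex.parallel no longer triggers a JIT build of
# the syncbn extension. An equivalent deferred-load variant that caches the built module
# after the first call is sketched below; it is illustrative only and not part of this
# patch (_get_syncbn is a hypothetical helper name).

from apex.op_builder import SyncBnBuilder

_syncbn = None

def _get_syncbn():
    global _syncbn
    if _syncbn is None:  # build/load the extension only on first use
        _syncbn = SyncBnBuilder().load()
    return _syncbn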
# mu = 1./N*np.sum(h, axis = 0) From 497f54a791cb66dda2a1158bc05693e9890d2f31 Mon Sep 17 00:00:00 2001 From: skishore Date: Thu, 3 Jul 2025 08:14:13 +0000 Subject: [PATCH 09/79] add fused layer norm module to jit build --- apex/normalization/fused_layer_norm.py | 25 +++++++++--------- op_builder/fused_layer_norm.py | 35 ++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 12 deletions(-) create mode 100644 op_builder/fused_layer_norm.py diff --git a/apex/normalization/fused_layer_norm.py b/apex/normalization/fused_layer_norm.py index 0c7bd2e09..493271577 100644 --- a/apex/normalization/fused_layer_norm.py +++ b/apex/normalization/fused_layer_norm.py @@ -8,6 +8,7 @@ from typing import List, Tuple from apex._autocast_utils import _cast_if_autocast_enabled +from apex.op_builder import FusedLayerNormBuilder global fused_layer_norm_cuda fused_layer_norm_cuda = None @@ -40,7 +41,7 @@ class FusedLayerNormAffineFunction(torch.autograd.Function): def forward(ctx, input, weight, bias, normalized_shape, eps, memory_efficient=False): global fused_layer_norm_cuda if fused_layer_norm_cuda is None: - fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda") + fused_layer_norm_cuda = FusedLayerNormBuilder().load() ctx.normalized_shape = normalized_shape ctx.eps = eps ctx.memory_efficient = memory_efficient @@ -80,7 +81,7 @@ def fused_layer_norm_affine_fwd( ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: global fused_layer_norm_cuda if fused_layer_norm_cuda is None: - fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda") + fused_layer_norm_cuda = FusedLayerNormBuilder().load() input_ = input.contiguous() weight_ = weight.contiguous() @@ -197,7 +198,7 @@ class FusedRMSNormAffineFunction(torch.autograd.Function): def forward(ctx, input, weight, normalized_shape, eps, memory_efficient=False): global fused_layer_norm_cuda if fused_layer_norm_cuda is None: - fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda") + fused_layer_norm_cuda = FusedLayerNormBuilder().load() ctx.normalized_shape = normalized_shape ctx.eps = eps ctx.memory_efficient = memory_efficient @@ -232,7 +233,7 @@ def fused_rms_norm_affine_fwd( ) -> Tuple[torch.Tensor, torch.Tensor]: global fused_layer_norm_cuda if fused_layer_norm_cuda is None: - fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda") + fused_layer_norm_cuda = FusedLayerNormBuilder().load() input_ = input.contiguous() weight_ = weight.contiguous() @@ -350,7 +351,7 @@ class FusedLayerNormAffineMixedDtypesFunction(FusedLayerNormAffineFunction): def forward(ctx, input, weight, bias, normalized_shape, eps, memory_efficient=False): global fused_layer_norm_cuda if fused_layer_norm_cuda is None: - fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda") + fused_layer_norm_cuda = FusedLayerNormBuilder().load() ctx.normalized_shape = normalized_shape ctx.eps = eps ctx.memory_efficient = memory_efficient @@ -373,7 +374,7 @@ class FusedRMSNormAffineMixedDtypesFunction(FusedRMSNormAffineFunction): def forward(ctx, input, weight, normalized_shape, eps, memory_efficient=False): global fused_layer_norm_cuda if fused_layer_norm_cuda is None: - fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda") + fused_layer_norm_cuda = FusedLayerNormBuilder().load() ctx.normalized_shape = normalized_shape ctx.eps = eps ctx.memory_efficient = memory_efficient @@ -394,7 +395,7 @@ class FusedLayerNormFunction(torch.autograd.Function): def forward(ctx, input, normalized_shape, eps, 
memory_efficient=False): global fused_layer_norm_cuda if fused_layer_norm_cuda is None: - fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda") + fused_layer_norm_cuda = FusedLayerNormBuilder().load() ctx.normalized_shape = normalized_shape ctx.eps = eps ctx.memory_efficient = memory_efficient @@ -427,7 +428,7 @@ def fused_layer_norm_fwd( ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: global fused_layer_norm_cuda if fused_layer_norm_cuda is None: - fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda") + fused_layer_norm_cuda = FusedLayerNormBuilder().load() input_ = input.contiguous() output, mean, invvar = fused_layer_norm_cuda.forward( @@ -525,7 +526,7 @@ class FusedRMSNormFunction(torch.autograd.Function): def forward(ctx, input, normalized_shape, eps, memory_efficient=False): global fused_layer_norm_cuda if fused_layer_norm_cuda is None: - fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda") + fused_layer_norm_cuda = FusedLayerNormBuilder().load() ctx.normalized_shape = normalized_shape ctx.eps = eps ctx.memory_efficient = memory_efficient @@ -558,7 +559,7 @@ def fused_rms_norm_fwd( ) -> Tuple[torch.Tensor, torch.Tensor]: global fused_layer_norm_cuda if fused_layer_norm_cuda is None: - fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda") + fused_layer_norm_cuda = FusedLayerNormBuilder().load() input_ = input.contiguous() output, invvar = fused_layer_norm_cuda.rms_forward( @@ -773,7 +774,7 @@ def __init__(self, normalized_shape, eps=1e-5, elementwise_affine=True, memory_e super().__init__() global fused_layer_norm_cuda - fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda") + fused_layer_norm_cuda = FusedLayerNormBuilder().load() if isinstance(normalized_shape, numbers.Integral): normalized_shape = (normalized_shape,) @@ -872,7 +873,7 @@ def __init__(self, normalized_shape, eps=1e-5, elementwise_affine=True, memory_e super().__init__() global fused_layer_norm_cuda - fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda") + fused_layer_norm_cuda = FusedLayerNormBuilder().load() if isinstance(normalized_shape, numbers.Integral): normalized_shape = (normalized_shape,) diff --git a/op_builder/fused_layer_norm.py b/op_builder/fused_layer_norm.py new file mode 100644 index 000000000..a078c9c27 --- /dev/null +++ b/op_builder/fused_layer_norm.py @@ -0,0 +1,35 @@ +from .builder import CUDAOpBuilder + +import sys + + +class FusedLayerNormBuilder(CUDAOpBuilder): + BUILD_VAR = 'DS_BUILD_FUSED_LAYER_NORM' + NAME = "fused_layer_norm_cuda" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ['csrc/layer_norm_cuda.cpp', 'csrc/layer_norm_cuda_kernel.cu'] + + def include_paths(self): + return ['csrc'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + nvcc_flags = ['-O3'] + self.version_dependent_macros() + if self.is_rocm_pytorch(): + ROCM_MAJOR, ROCM_MINOR = self.installed_rocm_version() + nvcc_flags += ['-DROCM_VERSION_MAJOR=%s' % ROCM_MAJOR, '-DROCM_VERSION_MINOR=%s' % ROCM_MINOR] + else: + nvcc_flags.extend( + ['-allow-unsupported-compiler' if sys.platform == "win32" else '', '-lineinfo', '--use_fast_math', '-maxrregcount=50'] + + self.compute_capability_args()) + return nvcc_flags \ No newline at end of file From 12222ebe10627f67b362f316c9c81e38aa317f2a Mon Sep 17 00:00:00 2001 From: skishore Date: Thu, 3 Jul 
2025 10:07:22 +0000 Subject: [PATCH 10/79] make focal loss module as jit module --- MANIFEST.in | 2 ++ apex/contrib/focal_loss/focal_loss.py | 5 ++--- contrib/csrc | 1 + setup.py | 3 --- 4 files changed, 5 insertions(+), 6 deletions(-) create mode 100644 MANIFEST.in create mode 120000 contrib/csrc diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 000000000..fb61d7cb6 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,2 @@ +recursive-include contrib/csrc * +recursive-include csrc * \ No newline at end of file diff --git a/apex/contrib/focal_loss/focal_loss.py b/apex/contrib/focal_loss/focal_loss.py index 85c6f620e..a63c43103 100644 --- a/apex/contrib/focal_loss/focal_loss.py +++ b/apex/contrib/focal_loss/focal_loss.py @@ -1,7 +1,6 @@ import torch - -import focal_loss_cuda - +from apex.op_builder import FocalLossBuilder +focal_loss_cuda = FocalLossBuilder().load() class FocalLoss(torch.autograd.Function): @staticmethod diff --git a/contrib/csrc b/contrib/csrc new file mode 120000 index 000000000..4e941d8b2 --- /dev/null +++ b/contrib/csrc @@ -0,0 +1 @@ +../apex/contrib/csrc \ No newline at end of file diff --git a/setup.py b/setup.py index 96b2d8a1d..42f9849fc 100644 --- a/setup.py +++ b/setup.py @@ -410,8 +410,5 @@ def op_enabled(op_name): cmdclass={'build_ext': BuildExtension} if ext_modules2 else {}, extras_require=extras, install_requires=required, - package_data={ - "apex": ["csrc/**/*", "csrc/*"], # include all files in csrc/ - }, ) From 1a72cb0dd4158b5f5c940c9b15098b3929d6b61b Mon Sep 17 00:00:00 2001 From: skishore Date: Thu, 3 Jul 2025 10:51:30 +0000 Subject: [PATCH 11/79] make focal loss module as jit module --- op_builder/focal_loss.py | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 op_builder/focal_loss.py diff --git a/op_builder/focal_loss.py b/op_builder/focal_loss.py new file mode 100644 index 000000000..1bc2f87b6 --- /dev/null +++ b/op_builder/focal_loss.py @@ -0,0 +1,36 @@ +from .builder import CUDAOpBuilder + +import sys + + +class FocalLossBuilder(CUDAOpBuilder): + BUILD_VAR = 'DS_BUILD_FOCAL_LOSS' + NAME = "focal_loss_cuda" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ['contrib/csrc/focal_loss/focal_loss_cuda.cpp', + 'contrib/csrc/focal_loss/focal_loss_cuda_kernel.cu'] + + def include_paths(self): + return ['contrib/csrc/' ] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + nvcc_flags = ['-O3'] + self.version_dependent_macros() + if self.is_rocm_pytorch(): + ROCM_MAJOR, ROCM_MINOR = self.installed_rocm_version() + nvcc_flags += ['-DROCM_VERSION_MAJOR=%s' % ROCM_MAJOR, '-DROCM_VERSION_MINOR=%s' % ROCM_MINOR] + else: + nvcc_flags.extend( + ['-allow-unsupported-compiler' if sys.platform == "win32" else '', '--ftz=false', '--use_fast_math'] + + self.compute_capability_args()) + return nvcc_flags \ No newline at end of file From 5ee61157992201978b6706d00f981d25e29613f2 Mon Sep 17 00:00:00 2001 From: skishore Date: Thu, 3 Jul 2025 10:52:19 +0000 Subject: [PATCH 12/79] make xentropy module as jit module --- apex/contrib/xentropy/__init__.py | 4 +++- apex/contrib/xentropy/softmax_xentropy.py | 4 +++- op_builder/xentropy.py | 29 +++++++++++++++++++++++ 3 files changed, 35 insertions(+), 2 deletions(-) create mode 100644 op_builder/xentropy.py diff --git a/apex/contrib/xentropy/__init__.py b/apex/contrib/xentropy/__init__.py index 7dff6a27a..dcff69d53 
100644 --- a/apex/contrib/xentropy/__init__.py +++ b/apex/contrib/xentropy/__init__.py @@ -1,6 +1,8 @@ try: import torch - import xentropy_cuda + from apex.op_builder import XentropyBuilder + xentropy_cuda = XentropyBuilder().load() + from .softmax_xentropy import SoftmaxCrossEntropyLoss del torch del xentropy_cuda diff --git a/apex/contrib/xentropy/softmax_xentropy.py b/apex/contrib/xentropy/softmax_xentropy.py index 33fbf8b21..4a8f97f3c 100644 --- a/apex/contrib/xentropy/softmax_xentropy.py +++ b/apex/contrib/xentropy/softmax_xentropy.py @@ -1,5 +1,7 @@ import torch -import xentropy_cuda +from apex.op_builder import XentropyBuilder + +xentropy_cuda = XentropyBuilder().load() class SoftmaxCrossEntropyLoss(torch.autograd.Function): @staticmethod diff --git a/op_builder/xentropy.py b/op_builder/xentropy.py new file mode 100644 index 000000000..cf098d192 --- /dev/null +++ b/op_builder/xentropy.py @@ -0,0 +1,29 @@ +from .builder import CUDAOpBuilder + +import sys + + +class XentropyBuilder(CUDAOpBuilder): + BUILD_VAR = 'DS_BUILD_XENTROPY' + NAME = "xentropy_cuda" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ['contrib/csrc/xentropy/interface.cpp', + 'contrib/csrc/xentropy/xentropy_kernel.cu'] + + def include_paths(self): + return ['csrc', 'contrib/csrc/' ] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + nvcc_flags = ['-O3'] + self.version_dependent_macros() + return nvcc_flags \ No newline at end of file From 6533731102b7c7e8e0fa0a2275d336122122c590 Mon Sep 17 00:00:00 2001 From: skishore Date: Thu, 3 Jul 2025 14:01:24 +0000 Subject: [PATCH 13/79] make bpn module as jit module --- apex/contrib/groupbn/__init__.py | 3 ++- apex/contrib/groupbn/batch_norm.py | 3 ++- op_builder/bnp.py | 33 ++++++++++++++++++++++++++++++ 3 files changed, 37 insertions(+), 2 deletions(-) create mode 100644 op_builder/bnp.py diff --git a/apex/contrib/groupbn/__init__.py b/apex/contrib/groupbn/__init__.py index 2f8577066..9ab407c01 100644 --- a/apex/contrib/groupbn/__init__.py +++ b/apex/contrib/groupbn/__init__.py @@ -1,6 +1,7 @@ try: import torch - import bnp + from apex.op_builder import BnpBuilder + bnp = BnpBuilder().load() from .batch_norm import BatchNorm2d_NHWC del torch del bnp diff --git a/apex/contrib/groupbn/batch_norm.py b/apex/contrib/groupbn/batch_norm.py index af0b7e9b2..b7bc79676 100644 --- a/apex/contrib/groupbn/batch_norm.py +++ b/apex/contrib/groupbn/batch_norm.py @@ -2,7 +2,8 @@ import numpy as np from torch.nn.modules.batchnorm import _BatchNorm -import bnp +from apex.op_builder import BnpBuilder +bnp = BnpBuilder().load() def check_if_rocm_pytorch(): is_rocm_pytorch = False diff --git a/op_builder/bnp.py b/op_builder/bnp.py new file mode 100644 index 000000000..75a30c34c --- /dev/null +++ b/op_builder/bnp.py @@ -0,0 +1,33 @@ +from .builder import CUDAOpBuilder + +import sys + + +class BnpBuilder(CUDAOpBuilder): + BUILD_VAR = 'DS_BUILD_BNP' + NAME = "bnp" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ['contrib/csrc/groupbn/batch_norm.cu', + 'contrib/csrc/groupbn/ipc.cu', + 'contrib/csrc/groupbn/interface.cpp', + 'contrib/csrc/groupbn/batch_norm_add_relu.cu'] + + def include_paths(self): + return ['contrib/csrc', 'csrc'] + + def cxx_args(self): + args = super().cxx_args() + return args + 
self.version_dependent_macros() + + def nvcc_args(self): + return ['-DCUDA_HAS_FP16=1', + '-D__CUDA_NO_HALF_OPERATORS__', + '-D__CUDA_NO_HALF_CONVERSIONS__', + '-D__CUDA_NO_HALF2_OPERATORS__'] + self.version_dependent_macros() \ No newline at end of file From 4a1a8f82c51100a58312f3a1ebeb28efa55f6992 Mon Sep 17 00:00:00 2001 From: skishore Date: Thu, 17 Jul 2025 05:07:25 +0000 Subject: [PATCH 14/79] add code to build individual extensions without JIT --- MANIFEST.in | 4 +- accelerator/abstract_accelerator.py | 2 +- accelerator/cpu_accelerator.py | 16 ++--- accelerator/cuda_accelerator.py | 6 +- accelerator/real_accelerator.py | 69 ++++--------------- op_builder/all_ops.py | 2 +- op_builder/bnp.py | 2 +- op_builder/builder.py | 44 ++++++------ op_builder/focal_loss.py | 2 +- op_builder/fused_bias_swiglu.py | 2 +- op_builder/fused_dense.py | 2 +- op_builder/fused_layer_norm.py | 2 +- op_builder/fused_rope.py | 2 +- .../generic_scaled_masked_softmax_cuda.py | 2 +- op_builder/scaled_masked_softmax_cuda.py | 2 +- op_builder/scaled_softmax_cuda.py | 2 +- ...scaled_upper_triang_masked_softmax_cuda.py | 2 +- op_builder/syncbn.py | 2 +- op_builder/xentropy.py | 2 +- setup.py | 17 ++--- 20 files changed, 70 insertions(+), 114 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index fb61d7cb6..a5dc0456c 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,2 +1,2 @@ -recursive-include contrib/csrc * -recursive-include csrc * \ No newline at end of file +recursive-include apex/contrib/csrc * +recursive-include apex/csrc * \ No newline at end of file diff --git a/accelerator/abstract_accelerator.py b/accelerator/abstract_accelerator.py index e8064de4a..ecce91c48 100644 --- a/accelerator/abstract_accelerator.py +++ b/accelerator/abstract_accelerator.py @@ -1,7 +1,7 @@ # Copyright (c) Microsoft Corporation. # SPDX-License-Identifier: Apache-2.0 -# DeepSpeed Team +# Taken from DeepSpeed import abc from abc import ABC diff --git a/accelerator/cpu_accelerator.py b/accelerator/cpu_accelerator.py index 5bd66926d..e96b3c5d1 100644 --- a/accelerator/cpu_accelerator.py +++ b/accelerator/cpu_accelerator.py @@ -1,7 +1,7 @@ # Copyright (c) Microsoft Corporation. # SPDX-License-Identifier: Apache-2.0 -# DeepSpeed Team +# Taken from DeepSpeed from .abstract_accelerator import ApexAccelerator @@ -72,7 +72,7 @@ def device_count(self): if device_count > 0: return device_count else: - from deepspeed.utils.numa import get_numa_cores + from apex.utils.numa import get_numa_cores # Count NUMA node for number of cpu accelerators. On machine with HBM # In flat mode, HBM is in separate NUMA node with no cores on this node. # Ignore these NUMA nodes with no cores. @@ -120,7 +120,7 @@ def Stream(self): return None def stream(self, stream): - from deepspeed.runtime.utils import noop_context + from apex.runtime.utils import noop_context return noop_context() def current_stream(self, device_index=None): @@ -246,7 +246,7 @@ def create_graph(self): return None def capture_to_graph(self, graph, pool=None, stream=None): - from deepspeed.runtime.utils import noop_context + from apex.runtime.utils import noop_context return noop_context() def replay_graph(self, graph): @@ -289,7 +289,7 @@ def is_pinned(self, tensor): def op_builder_dir(self): try: - # is op_builder from deepspeed or a 3p version? this should only succeed if it's deepspeed + # is op_builder from apex or a 3p version? 
this should only succeed if it's apex # if successful this also means we're doing a local install and not JIT compile path from op_builder import __apex__ # noqa: F401 # type: ignore return "op_builder.cpu" @@ -313,12 +313,12 @@ def create_op_builder(self, op_name): # return an op builder class, name specified by class_name def get_op_builder(self, class_name): try: - # is op_builder from deepspeed or a 3p version? this should only succeed if it's deepspeed + # is op_builder from apex or a 3p version? this should only succeed if it's apex # if successful this also means we're doing a local install and not JIT compile path - from op_builder import __deepspeed__ # noqa: F401 # type: ignore + from op_builder import __apex__ # noqa: F401 # type: ignore from op_builder.cpu import AsyncIOBuilder, CCLCommBuilder, ShareMemCommBuilder, FusedAdamBuilder, CPUAdamBuilder, NotImplementedBuilder except ImportError: - from deepspeed.ops.op_builder.cpu import AsyncIOBuilder, CCLCommBuilder, ShareMemCommBuilder, FusedAdamBuilder, CPUAdamBuilder, NotImplementedBuilder + from apex.ops.op_builder.cpu import AsyncIOBuilder, CCLCommBuilder, ShareMemCommBuilder, FusedAdamBuilder, CPUAdamBuilder, NotImplementedBuilder if class_name == "CCLCommBuilder": return CCLCommBuilder diff --git a/accelerator/cuda_accelerator.py b/accelerator/cuda_accelerator.py index 48dacb83b..a9dac1c5e 100644 --- a/accelerator/cuda_accelerator.py +++ b/accelerator/cuda_accelerator.py @@ -1,7 +1,7 @@ # Copyright (c) Microsoft Corporation. # SPDX-License-Identifier: Apache-2.0 -# DeepSpeed Team +# Taken from DeepSpeed import functools import os @@ -204,7 +204,7 @@ def is_fp16_supported(self): return True # See https://docs.nvidia.com/deeplearning/tensorrt/support-matrix/index.html#hardware-precision-matrix # FP16 on compute capability 6.x is deprecated - allow_deprecated_fp16 = os.environ.get('DS_ALLOW_DEPRECATED_FP16', '0') == '1' + allow_deprecated_fp16 = os.environ.get('APEX_ALLOW_DEPRECATED_FP16', '0') == '1' major, _ = torch.cuda.get_device_capability() if major >= 7: return True @@ -307,7 +307,7 @@ def on_accelerator(self, tensor): def op_builder_dir(self): try: - # is op_builder from deepspeed or a 3p version? this should only succeed if it's deepspeed + # is op_builder from apex or a 3p version? this should only succeed if it's apex # if successful this also means we're doing a local install and not JIT compile path from op_builder import __apex__ # noqa: F401 # type: ignore return "op_builder" diff --git a/accelerator/real_accelerator.py b/accelerator/real_accelerator.py index aaba7cf23..59cdc68a7 100644 --- a/accelerator/real_accelerator.py +++ b/accelerator/real_accelerator.py @@ -1,13 +1,13 @@ # Copyright (c) Microsoft Corporation. # SPDX-License-Identifier: Apache-2.0 -# DeepSpeed Team +# Taken from DeepSpeed import os try: # Importing logger currently requires that torch is installed, hence the try...except # TODO: Remove logger dependency on torch. 
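For reference, the APEX_ALLOW_DEPRECATED_FP16 opt-in added to cuda_accelerator.py above can be exercised as in the short sketch below. Only the variable name and is_fp16_supported() come from this series; the import path and the exact capability behaviour are assumptions based on the surrounding diffs.

    # Sketch only: opt in to FP16 on compute capability 6.x devices, which the
    # patched is_fp16_supported() otherwise treats as deprecated.
    import os
    os.environ["APEX_ALLOW_DEPRECATED_FP16"] = "1"

    from apex.accelerator import get_accelerator

    acc = get_accelerator()
    print(acc.is_fp16_supported())   # True on cc >= 7.0, and on 6.x only with the override set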
- from deepspeed.utils import logger as accel_logger + from apex.utils import logger as accel_logger except ImportError as e: accel_logger = None @@ -28,13 +28,13 @@ def _validate_accelerator(accel_obj): # because abstract_accelerator has different path during # build time (accelerator.abstract_accelerator) - # and run time (deepspeed.accelerator.abstract_accelerator) + # and run time (apex.accelerator.abstract_accelerator) # and extension would import the - # run time abstract_accelerator/DeepSpeedAccelerator as its base + # run time abstract_accelerator/ApexAccelerator as its base # class, so we need to compare accel_obj with both base class. - # if accel_obj is instance of DeepSpeedAccelerator in one of + # if accel_obj is instance of ApexAccelerator in one of # accelerator.abstractor_accelerator - # or deepspeed.accelerator.abstract_accelerator, consider accel_obj + # or apex.accelerator.abstract_accelerator, consider accel_obj # is a conforming object if not ((dsa1 is not None and isinstance(accel_obj, dsa1)) or (dsa2 is not None and isinstance(accel_obj, dsa2))): raise AssertionError(f"{accel_obj.__class__.__name__} accelerator is not subclass of ApexAccelerator") @@ -55,13 +55,13 @@ def get_accelerator(): accelerator_name = None ds_set_method = None - # 1. Detect whether there is override of DeepSpeed accelerators from environment variable. - if "DS_ACCELERATOR" in os.environ.keys(): - accelerator_name = os.environ["DS_ACCELERATOR"] + # 1. Detect whether there is override of apex accelerators from environment variable. + if "APEX_ACCELERATOR" in os.environ.keys(): + accelerator_name = os.environ["APEX_ACCELERATOR"] if accelerator_name == "cpu": pass elif accelerator_name not in SUPPORTED_ACCELERATOR_LIST: - raise ValueError(f'DS_ACCELERATOR must be one of {SUPPORTED_ACCELERATOR_LIST}. ' + raise ValueError(f'APEX_ACCELERATOR must be one of {SUPPORTED_ACCELERATOR_LIST}. 
' f'Value "{accelerator_name}" is not supported') ds_set_method = "override" @@ -114,7 +114,7 @@ def get_accelerator(): ds_accelerator = CPU_Accelerator() _validate_accelerator(ds_accelerator) if accel_logger is not None: - accel_logger.info(f"Setting ds_accelerator to {ds_accelerator._name} ({ds_set_method})") + accel_logger.info(f"Setting apex_accelerator to {ds_accelerator._name} ({ds_set_method})") return ds_accelerator @@ -122,50 +122,5 @@ def set_accelerator(accel_obj): global ds_accelerator _validate_accelerator(accel_obj) if accel_logger is not None: - accel_logger.info(f"Setting ds_accelerator to {accel_obj._name} (model specified)") + accel_logger.info(f"Setting apex_accelerator to {accel_obj._name} (model specified)") ds_accelerator = accel_obj - - -""" ------------[code] test_get.py ----------- -from deepspeed.accelerator import get_accelerator -my_accelerator = get_accelerator() -logger.info(f'{my_accelerator._name=}') -logger.info(f'{my_accelerator._communication_backend=}') -logger.info(f'{my_accelerator.HalfTensor().device=}') -logger.info(f'{my_accelerator.total_memory()=}') ------------[code] test_get.py ----------- - ----[output] python test_get.py--------- -my_accelerator.name()='cuda' -my_accelerator.communication_backend='nccl' -my_accelerator.HalfTensor().device=device(type='cuda', index=0) -my_accelerator.total_memory()=34089730048 ----[output] python test_get.py--------- - -************************************************************************** ------------[code] test_set.py ----------- -from deepspeed.accelerator.cuda_accelerator import CUDA_Accelerator -cu_accel = CUDA_Accelerator() -logger.info(f'{id(cu_accel)=}') -from deepspeed.accelerator import set_accelerator, get_accelerator -set_accelerator(cu_accel) - -my_accelerator = get_accelerator() -logger.info(f'{id(my_accelerator)=}') -logger.info(f'{my_accelerator._name=}') -logger.info(f'{my_accelerator._communication_backend=}') -logger.info(f'{my_accelerator.HalfTensor().device=}') -logger.info(f'{my_accelerator.total_memory()=}') ------------[code] test_set.py ----------- - - ----[output] python test_set.py--------- -id(cu_accel)=139648165478304 -my_accelerator= -my_accelerator.name='cuda' -my_accelerator.communication_backend='nccl' -my_accelerator.HalfTensor().device=device(type='cuda', index=0) -my_accelerator.total_memory()=34089730048 ----[output] python test_set.py--------- -""" \ No newline at end of file diff --git a/op_builder/all_ops.py b/op_builder/all_ops.py index 66b34d5bc..7c12e50d3 100644 --- a/op_builder/all_ops.py +++ b/op_builder/all_ops.py @@ -2,7 +2,7 @@ import pkgutil import importlib try: - # during installation time accelerator is visible, otherwise return deepspeed.accelerator + # during installation time accelerator is visible, otherwise return apex.accelerator from accelerator import get_accelerator except ImportError: from apex.accelerator import get_accelerator diff --git a/op_builder/bnp.py b/op_builder/bnp.py index 75a30c34c..9d8871df2 100644 --- a/op_builder/bnp.py +++ b/op_builder/bnp.py @@ -4,7 +4,7 @@ class BnpBuilder(CUDAOpBuilder): - BUILD_VAR = 'DS_BUILD_BNP' + BUILD_VAR = 'APEX_BUILD_BNP' NAME = "bnp" def __init__(self): diff --git a/op_builder/builder.py b/op_builder/builder.py index 1fd7a4504..1c34e7652 100644 --- a/op_builder/builder.py +++ b/op_builder/builder.py @@ -1,7 +1,7 @@ # Copyright (c) Microsoft Corporation. 
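A minimal usage sketch of the APEX_ACCELERATOR override handled above: the environment variable, get_accelerator() and the "(override)" log line come from this patch, while the chosen value and the printed attributes are only illustrative.

    # Not part of the patch: select the accelerator explicitly before apex is used.
    import os
    os.environ["APEX_ACCELERATOR"] = "cpu"   # "cpu", or a name from SUPPORTED_ACCELERATOR_LIST

    from apex.accelerator import get_accelerator

    acc = get_accelerator()                  # logs: Setting apex_accelerator to cpu (override)
    print(acc._name, acc.device_count())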
# SPDX-License-Identifier: Apache-2.0 -# DeepSpeed Team +# Taken from DeepSpeed import os import re @@ -30,7 +30,7 @@ try: import torch except ImportError: - print(f"{WARNING} unable to import torch, please install it if you want to pre-compile any deepspeed ops.") + print(f"{WARNING} unable to import torch, please install it if you want to pre-compile any apex ops.") else: TORCH_MAJOR = int(torch.__version__.split('.')[0]) TORCH_MINOR = int(torch.__version__.split('.')[1]) @@ -79,7 +79,7 @@ def get_default_compute_capabilities(): # list compatible minor CUDA versions - so that for example pytorch built with cuda-11.0 can be used -# to build deepspeed and system-wide installed cuda 11.2 +# to build apex and system-wide installed cuda 11.2 cuda_minor_mismatch_ok = { 10: ["10.0", "10.1", "10.2"], 11: ["11.0", "11.1", "11.2", "11.3", "11.4", "11.5", "11.6", "11.7", "11.8"], @@ -100,15 +100,15 @@ def assert_no_cuda_mismatch(name=""): f"version torch was compiled with {torch.version.cuda} " "but since the APIs are compatible, accepting this combination") return True - elif os.getenv("DS_SKIP_CUDA_CHECK", "0") == "1": + elif os.getenv("APEX_SKIP_CUDA_CHECK", "0") == "1": print( - f"{WARNING} DeepSpeed Op Builder: Installed CUDA version {sys_cuda_version} does not match the " + f"{WARNING} Apex Op Builder: Installed CUDA version {sys_cuda_version} does not match the " f"version torch was compiled with {torch.version.cuda}." - "Detected `DS_SKIP_CUDA_CHECK=1`: Allowing this combination of CUDA, but it may result in unexpected behavior." + "Detected `APEX_SKIP_CUDA_CHECK=1`: Allowing this combination of CUDA, but it may result in unexpected behavior." ) return True raise CUDAMismatchException( - f">- DeepSpeed Op Builder: Installed CUDA version {sys_cuda_version} does not match the " + f">- Apex Op Builder: Installed CUDA version {sys_cuda_version} does not match the " f"version torch was compiled with {torch.version.cuda}, unable to compile " "cuda/cpp extensions without a matching cuda version.") return True @@ -132,15 +132,15 @@ def __init__(self, name): @abstractmethod def absolute_name(self): ''' - Returns absolute build path for cases where the op is pre-installed, e.g., deepspeed.ops.adam.cpu_adam - will be installed as something like: deepspeed/ops/adam/cpu_adam.so + Returns absolute build path for cases where the op is pre-installed, e.g., apex.ops.adam.cpu_adam + will be installed as something like: apex/ops/adam/cpu_adam.so ''' pass @abstractmethod def sources(self): ''' - Returns list of source files for your op, relative to root of deepspeed package (i.e., DeepSpeed/deepspeed) + Returns list of source files for your op, relative to root of apex package ''' pass @@ -155,9 +155,9 @@ def validate_torch_version(torch_info): install_torch_version = torch_info['version'] current_torch_version = ".".join(torch.__version__.split('.')[:2]) if install_torch_version != current_torch_version: - raise RuntimeError("PyTorch version mismatch! DeepSpeed ops were compiled and installed " + raise RuntimeError("PyTorch version mismatch! apex ops were compiled and installed " "with a different version than what is being used at runtime. " - f"Please re-install DeepSpeed or switch torch versions. " + f"Please re-install apex or switch torch versions. 
" f"Install torch version={install_torch_version}, " f"Runtime torch version={current_torch_version}") @@ -167,18 +167,18 @@ def validate_torch_op_version(torch_info): current_cuda_version = ".".join(torch.version.cuda.split('.')[:2]) install_cuda_version = torch_info['cuda_version'] if install_cuda_version != current_cuda_version: - raise RuntimeError("CUDA version mismatch! DeepSpeed ops were compiled and installed " + raise RuntimeError("CUDA version mismatch! apex ops were compiled and installed " "with a different version than what is being used at runtime. " - f"Please re-install DeepSpeed or switch torch versions. " + f"Please re-install apex or switch torch versions. " f"Install CUDA version={install_cuda_version}, " f"Runtime CUDA version={current_cuda_version}") else: current_hip_version = ".".join(torch.version.hip.split('.')[:2]) install_hip_version = torch_info['hip_version'] if install_hip_version != current_hip_version: - raise RuntimeError("HIP version mismatch! DeepSpeed ops were compiled and installed " + raise RuntimeError("HIP version mismatch! apex ops were compiled and installed " "with a different version than what is being used at runtime. " - f"Please re-install DeepSpeed or switch torch versions. " + f"Please re-install apex or switch torch versions. " f"Install HIP version={install_hip_version}, " f"Runtime HIP version={current_hip_version}") @@ -288,7 +288,7 @@ def get_rocm_wavefront_size(): def include_paths(self): ''' - Returns list of include paths, relative to root of deepspeed package (i.e., DeepSpeed/deepspeed) + Returns list of include paths, relative to root of apex package ''' return [] @@ -505,7 +505,7 @@ def warning(self, msg): self.error_log = f"{msg}" print(f"{WARNING} {msg}") - def deepspeed_src_path(self, code_path): + def apex_src_path(self, code_path): if os.path.isabs(code_path): return code_path else: @@ -556,8 +556,8 @@ def jit_load(self, verbose=True): from torch.utils.cpp_extension import load start_build = time.time() - sources = [os.path.abspath(self.deepspeed_src_path(path)) for path in self.sources()] - extra_include_paths = [os.path.abspath(self.deepspeed_src_path(path)) for path in self.include_paths()] + sources = [os.path.abspath(self.apex_src_path(path)) for path in self.sources()] + extra_include_paths = [os.path.abspath(self.apex_src_path(path)) for path in self.include_paths()] # Torch will try and apply whatever CCs are in the arch list at compile time, # we have already set the intended targets ourselves we know that will be @@ -778,7 +778,7 @@ def nvcc_args(self): ] else: try: - nvcc_threads = int(os.getenv("DS_NVCC_THREADS", "")) + nvcc_threads = int(os.getenv("APEX_NVCC_THREADS", "")) if nvcc_threads <= 0: raise ValueError("") except ValueError: @@ -797,7 +797,7 @@ def nvcc_args(self): '-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', '-U__CUDA_NO_HALF2_OPERATORS__', f'--threads={nvcc_threads}' ] - if os.environ.get('DS_DEBUG_CUDA_BUILD', '0') == '1': + if os.environ.get('APEX_DEBUG_CUDA_BUILD', '0') == '1': args.append('--ptxas-options=-v') args += self.compute_capability_args() return args diff --git a/op_builder/focal_loss.py b/op_builder/focal_loss.py index 1bc2f87b6..27693284b 100644 --- a/op_builder/focal_loss.py +++ b/op_builder/focal_loss.py @@ -4,7 +4,7 @@ class FocalLossBuilder(CUDAOpBuilder): - BUILD_VAR = 'DS_BUILD_FOCAL_LOSS' + BUILD_VAR = 'APEX_BUILD_FOCAL_LOSS' NAME = "focal_loss_cuda" def __init__(self): diff --git a/op_builder/fused_bias_swiglu.py b/op_builder/fused_bias_swiglu.py index 
adc729e6f..34ac494b5 100644 --- a/op_builder/fused_bias_swiglu.py +++ b/op_builder/fused_bias_swiglu.py @@ -3,7 +3,7 @@ import os class FusedBiasSwiGLUBuilder(CUDAOpBuilder): - BUILD_VAR = 'DS_BUILD_FUSED_BIAS_SWIGLU' + BUILD_VAR = 'APEX_BUILD_FUSED_BIAS_SWIGLU' NAME = "fused_bias_swiglu" def __init__(self): diff --git a/op_builder/fused_dense.py b/op_builder/fused_dense.py index a4c7b25f6..74c8067b4 100644 --- a/op_builder/fused_dense.py +++ b/op_builder/fused_dense.py @@ -4,7 +4,7 @@ class FusedDenseBuilder(CUDAOpBuilder): - BUILD_VAR = 'DS_BUILD_FUSED_DENSE' + BUILD_VAR = 'APEX_BUILD_FUSED_DENSE' NAME = "fused_dense_cuda" def __init__(self): diff --git a/op_builder/fused_layer_norm.py b/op_builder/fused_layer_norm.py index a078c9c27..3572166c7 100644 --- a/op_builder/fused_layer_norm.py +++ b/op_builder/fused_layer_norm.py @@ -4,7 +4,7 @@ class FusedLayerNormBuilder(CUDAOpBuilder): - BUILD_VAR = 'DS_BUILD_FUSED_LAYER_NORM' + BUILD_VAR = 'APEX_BUILD_FUSED_LAYER_NORM' NAME = "fused_layer_norm_cuda" def __init__(self): diff --git a/op_builder/fused_rope.py b/op_builder/fused_rope.py index 5237830b2..c72501fee 100644 --- a/op_builder/fused_rope.py +++ b/op_builder/fused_rope.py @@ -4,7 +4,7 @@ class FusedRopeBuilder(CUDAOpBuilder): - BUILD_VAR = 'DS_BUILD_FUSED_ROPE' + BUILD_VAR = 'APEX_BUILD_FUSED_ROPE' NAME = "fused_rotary_positional_embedding" def __init__(self): diff --git a/op_builder/generic_scaled_masked_softmax_cuda.py b/op_builder/generic_scaled_masked_softmax_cuda.py index 322d623f0..ef2a7978e 100644 --- a/op_builder/generic_scaled_masked_softmax_cuda.py +++ b/op_builder/generic_scaled_masked_softmax_cuda.py @@ -1,7 +1,7 @@ from .builder import CUDAOpBuilder class GenericScaledMaskedSoftmaxCudaBuilder(CUDAOpBuilder): - BUILD_VAR = 'DS_BUILD_GENERIC_SCALED_MASKED_SOFTMAX_CUDA' + BUILD_VAR = 'APEX_BUILD_GENERIC_SCALED_MASKED_SOFTMAX_CUDA' NAME = "generic_scaled_masked_softmax_cuda" def __init__(self): diff --git a/op_builder/scaled_masked_softmax_cuda.py b/op_builder/scaled_masked_softmax_cuda.py index eddbf0969..4a272646b 100644 --- a/op_builder/scaled_masked_softmax_cuda.py +++ b/op_builder/scaled_masked_softmax_cuda.py @@ -1,7 +1,7 @@ from .builder import CUDAOpBuilder class ScaledMaskedSoftmaxCudaBuilder(CUDAOpBuilder): - BUILD_VAR = 'DS_BUILD_SCALED_MASKED_SOFTMAX_CUDA' + BUILD_VAR = 'APEX_BUILD_SCALED_MASKED_SOFTMAX_CUDA' NAME = "scaled_masked_softmax_cuda" def __init__(self): diff --git a/op_builder/scaled_softmax_cuda.py b/op_builder/scaled_softmax_cuda.py index 24fa0544b..c116235c4 100644 --- a/op_builder/scaled_softmax_cuda.py +++ b/op_builder/scaled_softmax_cuda.py @@ -3,7 +3,7 @@ import sys class ScaledSoftmaxCudaBuilder(CUDAOpBuilder): - BUILD_VAR = 'DS_BUILD_SCALED_SOFTMAX_CUDA' + BUILD_VAR = 'APEX_BUILD_SCALED_SOFTMAX_CUDA' NAME = "scaled_softmax_cuda" def __init__(self): diff --git a/op_builder/scaled_upper_triang_masked_softmax_cuda.py b/op_builder/scaled_upper_triang_masked_softmax_cuda.py index c9f72db6f..a37de9977 100644 --- a/op_builder/scaled_upper_triang_masked_softmax_cuda.py +++ b/op_builder/scaled_upper_triang_masked_softmax_cuda.py @@ -1,7 +1,7 @@ from .builder import CUDAOpBuilder class ScaledUpperTriangMaskedSoftmaxCudaBuilder(CUDAOpBuilder): - BUILD_VAR = 'DS_BUILD_SCALED_UPPER_TRIANG_MASKED_SOFTMAX_CUDA' + BUILD_VAR = 'APEX_BUILD_SCALED_UPPER_TRIANG_MASKED_SOFTMAX_CUDA' NAME = "scaled_upper_triang_masked_softmax_cuda" def __init__(self): diff --git a/op_builder/syncbn.py b/op_builder/syncbn.py index 6869b69ea..f87420541 100644 --- 
a/op_builder/syncbn.py +++ b/op_builder/syncbn.py @@ -4,7 +4,7 @@ class SyncBnBuilder(CUDAOpBuilder): - BUILD_VAR = 'DS_BUILD_SYNCBN' + BUILD_VAR = 'APEX_BUILD_SYNCBN' NAME = "syncbn" def __init__(self): diff --git a/op_builder/xentropy.py b/op_builder/xentropy.py index cf098d192..5f2e02c54 100644 --- a/op_builder/xentropy.py +++ b/op_builder/xentropy.py @@ -4,7 +4,7 @@ class XentropyBuilder(CUDAOpBuilder): - BUILD_VAR = 'DS_BUILD_XENTROPY' + BUILD_VAR = 'APEX_BUILD_XENTROPY' NAME = "xentropy_cuda" def __init__(self): diff --git a/setup.py b/setup.py index 42f9849fc..55c1a39b9 100644 --- a/setup.py +++ b/setup.py @@ -271,8 +271,8 @@ def command_exists(cmd): BUILD_OP_PLATFORM = 1 if sys.platform == "win32" else 0 -BUILD_OP_DEFAULT = int(get_env_if_set('DS_BUILD_OPS', BUILD_OP_PLATFORM)) -print(f"DS_BUILD_OPS={BUILD_OP_DEFAULT}") +BUILD_OP_DEFAULT = int(get_env_if_set('APEX_BUILD_OPS', BUILD_OP_PLATFORM)) +print(f"APEX_BUILD_OPS={BUILD_OP_DEFAULT}") ext_modules2 = [] @@ -325,7 +325,7 @@ def op_enabled(op_name): # Write out version/git info. git_hash_cmd = shlex.split("bash -c \"git rev-parse --short HEAD\"") git_branch_cmd = shlex.split("bash -c \"git rev-parse --abbrev-ref HEAD\"") -if command_exists('git') and not is_env_set('DS_BUILD_STRING'): +if command_exists('git') and not is_env_set('APEX_BUILD_STRING'): try: result = subprocess.check_output(git_hash_cmd) git_hash = result.decode('utf-8').strip() @@ -338,18 +338,18 @@ def op_enabled(op_name): git_hash = "unknown" git_branch = "unknown" -# Parse the DeepSpeed version string from version.txt. +# Parse the apex version string from version.txt. version_str = get_apex_version() # Build specifiers like .devX can be added at install time. Otherwise, add the git hash. -# Example: `DS_BUILD_STRING=".dev20201022" python -m build --no-isolation`. +# Example: `APEX_BUILD_STRING=".dev20201022" python -m build --no-isolation`. # Building wheel for distribution, update version file. -if is_env_set('DS_BUILD_STRING'): +if is_env_set('APEX_BUILD_STRING'): # Build string env specified, probably building for distribution. with open('build.txt', 'w') as fd: - fd.write(os.environ['DS_BUILD_STRING']) - version_str += os.environ['DS_BUILD_STRING'] + fd.write(os.environ['APEX_BUILD_STRING']) + version_str += os.environ['APEX_BUILD_STRING'] elif os.path.isfile('build.txt'): # build.txt exists, probably installing from distribution. 
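The setup.py changes above only control what gets pre-compiled at install time (APEX_BUILD_OPS as the global default, APEX_BUILD_STRING for the version suffix, plus the per-op APEX_BUILD_* variables defined on each builder). At runtime every converted extension is reached through the same builder entry point; a rough sketch, assuming the DeepSpeed-style load()/jit_load() semantics in op_builder/builder.py are kept:

    # Sketch only: load() returns the pre-installed apex.fused_layer_norm_cuda module
    # when the op was compiled during installation, and otherwise falls back to
    # jit_load(), which builds the csrc/ sources with torch.utils.cpp_extension.load.
    from apex.op_builder import FusedLayerNormBuilder

    fused_layer_norm_cuda = FusedLayerNormBuilder().load()
    # used exactly like the old `import fused_layer_norm_cuda` from here on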
with open('build.txt', 'r') as fd: @@ -410,5 +410,6 @@ def op_enabled(op_name): cmdclass={'build_ext': BuildExtension} if ext_modules2 else {}, extras_require=extras, install_requires=required, + include_package_data=True ) From 01f22cd8c374cd83971fa32cc7c25a376932fdd1 Mon Sep 17 00:00:00 2001 From: skishore Date: Thu, 17 Jul 2025 07:52:47 +0000 Subject: [PATCH 15/79] clean up the flags for the modules based on apex/setup.py --- op_builder/bnp.py | 3 +-- op_builder/focal_loss.py | 10 +++------- op_builder/fused_bias_swiglu.py | 15 +++++++-------- op_builder/fused_dense.py | 10 +--------- op_builder/fused_layer_norm.py | 9 ++------- op_builder/fused_rope.py | 19 +++++++++++-------- .../generic_scaled_masked_softmax_cuda.py | 19 +++++++++---------- op_builder/scaled_masked_softmax_cuda.py | 19 +++++++++---------- op_builder/scaled_softmax_cuda.py | 15 +++++++-------- ...scaled_upper_triang_masked_softmax_cuda.py | 18 ++++++++---------- op_builder/syncbn.py | 10 +--------- op_builder/xentropy.py | 3 +-- 12 files changed, 60 insertions(+), 90 deletions(-) diff --git a/op_builder/bnp.py b/op_builder/bnp.py index 9d8871df2..47fe1fdb3 100644 --- a/op_builder/bnp.py +++ b/op_builder/bnp.py @@ -23,8 +23,7 @@ def include_paths(self): return ['contrib/csrc', 'csrc'] def cxx_args(self): - args = super().cxx_args() - return args + self.version_dependent_macros() + return self.version_dependent_macros() def nvcc_args(self): return ['-DCUDA_HAS_FP16=1', diff --git a/op_builder/focal_loss.py b/op_builder/focal_loss.py index 27693284b..af0fbf31b 100644 --- a/op_builder/focal_loss.py +++ b/op_builder/focal_loss.py @@ -19,18 +19,14 @@ def sources(self): def include_paths(self): return ['contrib/csrc/' ] - + def cxx_args(self): args = super().cxx_args() return args + self.version_dependent_macros() def nvcc_args(self): - nvcc_flags = ['-O3'] + self.version_dependent_macros() if self.is_rocm_pytorch(): - ROCM_MAJOR, ROCM_MINOR = self.installed_rocm_version() - nvcc_flags += ['-DROCM_VERSION_MAJOR=%s' % ROCM_MAJOR, '-DROCM_VERSION_MINOR=%s' % ROCM_MINOR] + nvcc_flags = ['-O3'] + self.version_dependent_macros() else: - nvcc_flags.extend( - ['-allow-unsupported-compiler' if sys.platform == "win32" else '', '--ftz=false', '--use_fast_math'] + - self.compute_capability_args()) + nvcc_flags = ['-O3', '--ftz=false', '--use_fast_math'] return nvcc_flags \ No newline at end of file diff --git a/op_builder/fused_bias_swiglu.py b/op_builder/fused_bias_swiglu.py index 34ac494b5..4b00b8212 100644 --- a/op_builder/fused_bias_swiglu.py +++ b/op_builder/fused_bias_swiglu.py @@ -27,18 +27,17 @@ def cxx_args(self): def nvcc_args(self): nvcc_flags = [ - '-O3', - '-U__CUDA_NO_HALF_OPERATORS__', - '-U__CUDA_NO_HALF_CONVERSIONS__', - '--expt-relaxed-constexpr', - '--expt-extended-lambda' - ] + self.version_dependent_macros() - if self.is_rocm_pytorch(): - nvcc_flags = [ '-O3', '-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__' ] + self.version_dependent_macros() + if not self.is_rocm_pytorch(): + nvcc_flags.extend( + [ + '--expt-relaxed-constexpr', + '--expt-extended-lambda' + ]) + else: # Handle ROCm arch flags amdgpu_targets = os.environ.get('PYTORCH_ROCM_ARCH', '') if not amdgpu_targets: diff --git a/op_builder/fused_dense.py b/op_builder/fused_dense.py index 74c8067b4..bef56ee4b 100644 --- a/op_builder/fused_dense.py +++ b/op_builder/fused_dense.py @@ -24,12 +24,4 @@ def cxx_args(self): return args + self.version_dependent_macros() def nvcc_args(self): - nvcc_flags = ['-O3'] + self.version_dependent_macros() - if 
self.is_rocm_pytorch(): - ROCM_MAJOR, ROCM_MINOR = self.installed_rocm_version() - nvcc_flags += ['-DROCM_VERSION_MAJOR=%s' % ROCM_MAJOR, '-DROCM_VERSION_MINOR=%s' % ROCM_MINOR] - else: - nvcc_flags.extend( - ['-allow-unsupported-compiler' if sys.platform == "win32" else '', '-lineinfo', '--use_fast_math'] + - self.compute_capability_args()) - return nvcc_flags \ No newline at end of file + return ['-O3'] + self.version_dependent_macros() \ No newline at end of file diff --git a/op_builder/fused_layer_norm.py b/op_builder/fused_layer_norm.py index 3572166c7..4e0ca9c8a 100644 --- a/op_builder/fused_layer_norm.py +++ b/op_builder/fused_layer_norm.py @@ -25,11 +25,6 @@ def cxx_args(self): def nvcc_args(self): nvcc_flags = ['-O3'] + self.version_dependent_macros() - if self.is_rocm_pytorch(): - ROCM_MAJOR, ROCM_MINOR = self.installed_rocm_version() - nvcc_flags += ['-DROCM_VERSION_MAJOR=%s' % ROCM_MAJOR, '-DROCM_VERSION_MINOR=%s' % ROCM_MINOR] - else: - nvcc_flags.extend( - ['-allow-unsupported-compiler' if sys.platform == "win32" else '', '-lineinfo', '--use_fast_math', '-maxrregcount=50'] + - self.compute_capability_args()) + if not self.is_rocm_pytorch(): + nvcc_flags.extend(['--use_fast_math', '-maxrregcount=50']) return nvcc_flags \ No newline at end of file diff --git a/op_builder/fused_rope.py b/op_builder/fused_rope.py index c72501fee..74c580b02 100644 --- a/op_builder/fused_rope.py +++ b/op_builder/fused_rope.py @@ -25,12 +25,15 @@ def cxx_args(self): return args + self.version_dependent_macros() def nvcc_args(self): - nvcc_flags = ['-O3'] + self.version_dependent_macros() - if self.is_rocm_pytorch(): - ROCM_MAJOR, ROCM_MINOR = self.installed_rocm_version() - nvcc_flags += ['-DROCM_VERSION_MAJOR=%s' % ROCM_MAJOR, '-DROCM_VERSION_MINOR=%s' % ROCM_MINOR] - else: + nvcc_flags = [ + '-O3', + '-U__CUDA_NO_HALF_OPERATORS__', + '-U__CUDA_NO_HALF_CONVERSIONS__' + ] + self.version_dependent_macros() + if not self.is_rocm_pytorch(): nvcc_flags.extend( - ['-allow-unsupported-compiler' if sys.platform == "win32" else '', '-lineinfo', '--use_fast_math'] + - self.compute_capability_args()) - return nvcc_flags \ No newline at end of file + [ + '--expt-relaxed-constexpr', + '--expt-extended-lambda' + ]) + return nvcc_flags diff --git a/op_builder/generic_scaled_masked_softmax_cuda.py b/op_builder/generic_scaled_masked_softmax_cuda.py index ef2a7978e..1cf963948 100644 --- a/op_builder/generic_scaled_masked_softmax_cuda.py +++ b/op_builder/generic_scaled_masked_softmax_cuda.py @@ -24,17 +24,16 @@ def cxx_args(self): return args + self.version_dependent_macros() def nvcc_args(self): - if self.is_rocm_pytorch(): - return [ + nvcc_flags = [ '-O3', '-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__' ] + self.version_dependent_macros() - else: - return [ - '-O3', - '-U__CUDA_NO_HALF_OPERATORS__', - '-U__CUDA_NO_HALF_CONVERSIONS__', - '--expt-relaxed-constexpr', - '--expt-extended-lambda' - ] + self.version_dependent_macros() + if not self.is_rocm_pytorch(): + nvcc_flags.extend( + [ + '--expt-relaxed-constexpr', + '--expt-extended-lambda' + ]) + return nvcc_flags + diff --git a/op_builder/scaled_masked_softmax_cuda.py b/op_builder/scaled_masked_softmax_cuda.py index 4a272646b..ee482ecda 100644 --- a/op_builder/scaled_masked_softmax_cuda.py +++ b/op_builder/scaled_masked_softmax_cuda.py @@ -25,17 +25,16 @@ def cxx_args(self): return args + self.version_dependent_macros() def nvcc_args(self): - if self.is_rocm_pytorch(): - return [ + nvcc_flags = [ '-O3', '-U__CUDA_NO_HALF_OPERATORS__', 
'-U__CUDA_NO_HALF_CONVERSIONS__' ] + self.version_dependent_macros() - else: - return [ - '-O3', - '-U__CUDA_NO_HALF_OPERATORS__', - '-U__CUDA_NO_HALF_CONVERSIONS__', - '--expt-relaxed-constexpr', - '--expt-extended-lambda' - ] + self.version_dependent_macros() + if not self.is_rocm_pytorch(): + nvcc_flags.extend( + [ + '--expt-relaxed-constexpr', + '--expt-extended-lambda' + ]) + return nvcc_flags + diff --git a/op_builder/scaled_softmax_cuda.py b/op_builder/scaled_softmax_cuda.py index c116235c4..21c75eb32 100644 --- a/op_builder/scaled_softmax_cuda.py +++ b/op_builder/scaled_softmax_cuda.py @@ -27,16 +27,15 @@ def cxx_args(self): def nvcc_args(self): nvcc_flags = [ - '-O3', - '-U__CUDA_NO_HALF_OPERATORS__', - '-U__CUDA_NO_HALF_CONVERSIONS__', - '--expt-relaxed-constexpr', - '--expt-extended-lambda' - ] + self.version_dependent_macros() - if self.is_rocm_pytorch(): - nvcc_flags = [ '-O3', '-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__' ] + self.version_dependent_macros() + if not self.is_rocm_pytorch(): + nvcc_flags.extend( + [ + '--expt-relaxed-constexpr', + '--expt-extended-lambda' + ]) return nvcc_flags + diff --git a/op_builder/scaled_upper_triang_masked_softmax_cuda.py b/op_builder/scaled_upper_triang_masked_softmax_cuda.py index a37de9977..045cd75df 100644 --- a/op_builder/scaled_upper_triang_masked_softmax_cuda.py +++ b/op_builder/scaled_upper_triang_masked_softmax_cuda.py @@ -24,17 +24,15 @@ def cxx_args(self): return args + self.version_dependent_macros() def nvcc_args(self): - if self.is_rocm_pytorch(): - return [ + nvcc_flags = [ '-O3', '-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__' ] + self.version_dependent_macros() - else: - return [ - '-O3', - '-U__CUDA_NO_HALF_OPERATORS__', - '-U__CUDA_NO_HALF_CONVERSIONS__', - '--expt-relaxed-constexpr', - '--expt-extended-lambda' - ] + self.version_dependent_macros() + if not self.is_rocm_pytorch(): + nvcc_flags.extend( + [ + '--expt-relaxed-constexpr', + '--expt-extended-lambda' + ]) + return nvcc_flags diff --git a/op_builder/syncbn.py b/op_builder/syncbn.py index f87420541..1e640aab2 100644 --- a/op_builder/syncbn.py +++ b/op_builder/syncbn.py @@ -24,12 +24,4 @@ def cxx_args(self): return args + self.version_dependent_macros() def nvcc_args(self): - nvcc_flags = ['-O3'] + self.version_dependent_macros() - if self.is_rocm_pytorch(): - ROCM_MAJOR, ROCM_MINOR = self.installed_rocm_version() - nvcc_flags += ['-DROCM_VERSION_MAJOR=%s' % ROCM_MAJOR, '-DROCM_VERSION_MINOR=%s' % ROCM_MINOR] - else: - nvcc_flags.extend( - ['-allow-unsupported-compiler' if sys.platform == "win32" else '', '-lineinfo', '--use_fast_math'] + - self.compute_capability_args()) - return nvcc_flags \ No newline at end of file + return ['-O3'] + self.version_dependent_macros() \ No newline at end of file diff --git a/op_builder/xentropy.py b/op_builder/xentropy.py index 5f2e02c54..fbe6f702c 100644 --- a/op_builder/xentropy.py +++ b/op_builder/xentropy.py @@ -25,5 +25,4 @@ def cxx_args(self): return args + self.version_dependent_macros() def nvcc_args(self): - nvcc_flags = ['-O3'] + self.version_dependent_macros() - return nvcc_flags \ No newline at end of file + return ['-O3'] + self.version_dependent_macros() \ No newline at end of file From 58d87ade81a102041d6856e4bfd2818117eba409 Mon Sep 17 00:00:00 2001 From: skishore Date: Thu, 17 Jul 2025 08:17:13 +0000 Subject: [PATCH 16/79] add function to get the backward_pass_guard_args in CudaOpBuilder and make MLP JIT compile --- apex/mlp/mlp.py | 3 ++- op_builder/builder.py | 23 
+++++++++++++++++++++++ op_builder/mlp.py | 31 +++++++++++++++++++++++++++++++ 3 files changed, 56 insertions(+), 1 deletion(-) create mode 100644 op_builder/mlp.py diff --git a/apex/mlp/mlp.py b/apex/mlp/mlp.py index 31b853292..fe3d382fd 100644 --- a/apex/mlp/mlp.py +++ b/apex/mlp/mlp.py @@ -5,8 +5,9 @@ from torch import nn from apex._autocast_utils import _cast_if_autocast_enabled -import mlp_cuda +from apex.op_builder import MlpBuilder +mlp_cuda = MlpBuilder().load() class MlpFunction(torch.autograd.Function): @staticmethod diff --git a/op_builder/builder.py b/op_builder/builder.py index 1c34e7652..ffcdc8caf 100644 --- a/op_builder/builder.py +++ b/op_builder/builder.py @@ -811,6 +811,29 @@ def libraries_args(self): else: return [] + def backward_pass_guard_args(self): + torch_dir = torch.__path__[0] + context_file = os.path.join(torch_dir, "include", "ATen", "Context.h") + if os.path.exists(context_file): + lines = open(context_file, 'r').readlines() + found_Backward_Pass_Guard = False + found_ROCmBackward_Pass_Guard = False + for line in lines: + if "BackwardPassGuard" in line: + # BackwardPassGuard has been renamed to ROCmBackwardPassGuard + # https://github.com/pytorch/pytorch/pull/71881/commits/4b82f5a67a35406ffb5691c69e6b4c9086316a43 + if "ROCmBackwardPassGuard" in line: + found_ROCmBackward_Pass_Guard = True + else: + found_Backward_Pass_Guard = True + break + backward_pass_guard_args = [] + if found_Backward_Pass_Guard: + backward_pass_guard_args += ['-DBACKWARD_PASS_GUARD'] + ['-DBACKWARD_PASS_GUARD_CLASS=BackwardPassGuard'] + if found_ROCmBackward_Pass_Guard: + backward_pass_guard_args += ['-DBACKWARD_PASS_GUARD'] + ['-DBACKWARD_PASS_GUARD_CLASS=ROCmBackwardPassGuard'] + return backward_pass_guard_args + class TorchCPUOpBuilder(CUDAOpBuilder): diff --git a/op_builder/mlp.py b/op_builder/mlp.py new file mode 100644 index 000000000..dd55679e3 --- /dev/null +++ b/op_builder/mlp.py @@ -0,0 +1,31 @@ +from .builder import CUDAOpBuilder + +import sys + + +class MlpBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_MLP' + NAME = "mlp_cuda" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ['csrc/mlp.cpp', + 'csrc/mlp_cuda.cu'] + + def include_paths(self): + return ['csrc'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + nvcc_flags = ['-O3'] + self.version_dependent_macros() + if self.is_rocm_pytorch(): + nvcc_flags.extend(self.backward_pass_guard_args()) + return nvcc_flags \ No newline at end of file From d47d871c445fb72136367cfa0bfd04d447be6939 Mon Sep 17 00:00:00 2001 From: skishore Date: Thu, 17 Jul 2025 08:37:36 +0000 Subject: [PATCH 17/79] add fused weight gradient mlp to jit compile --- apex/transformer/tensor_parallel/layers.py | 3 +- op_builder/fused_weight_gradient_mlp.py | 42 ++++++++++++++++++++++ 2 files changed, 44 insertions(+), 1 deletion(-) create mode 100644 op_builder/fused_weight_gradient_mlp.py diff --git a/apex/transformer/tensor_parallel/layers.py b/apex/transformer/tensor_parallel/layers.py index 346dfaa7a..5588569c7 100644 --- a/apex/transformer/tensor_parallel/layers.py +++ b/apex/transformer/tensor_parallel/layers.py @@ -54,7 +54,8 @@ _grad_accum_fusion_available = True try: - import fused_weight_gradient_mlp_cuda + from apex.op_builder import FusedWeightGradientMlpCudaBuilder + fused_weight_gradient_mlp_cuda = FusedWeightGradientMlpCudaBuilder().load() except ImportError: 
_grad_accum_fusion_available = False diff --git a/op_builder/fused_weight_gradient_mlp.py b/op_builder/fused_weight_gradient_mlp.py new file mode 100644 index 000000000..040c465a0 --- /dev/null +++ b/op_builder/fused_weight_gradient_mlp.py @@ -0,0 +1,42 @@ +from .builder import CUDAOpBuilder + +class FusedWeightGradientMlpCudaBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_FUSED_WEIGHT_GRADIENT_MLP' + NAME = "fused_weight_gradient_mlp_cuda" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return [ + "csrc/megatron/fused_weight_gradient_dense.cpp", + "csrc/megatron/fused_weight_gradient_dense_cuda.cu", + "csrc/megatron/fused_weight_gradient_dense_16bit_prec_cuda.cu", + ] + + def include_paths(self): + # Both csrc and csrc/megatron are included in the original extension + return ['csrc', 'csrc/megatron'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + nvcc_flags = [ + '-O3', + '-U__CUDA_NO_HALF_OPERATORS__', + '-U__CUDA_NO_HALF_CONVERSIONS__' + ] + self.version_dependent_macros() + if not self.is_rocm_pytorch(): + nvcc_flags.extend( + [ + '--expt-relaxed-constexpr', + '--expt-extended-lambda', + "--use_fast_math" + ]) + self.compute_capability_args() + return nvcc_flags + From fc60c28eb335fa8d20f4e593a8717719fe46223f Mon Sep 17 00:00:00 2001 From: skishore Date: Thu, 17 Jul 2025 08:57:01 +0000 Subject: [PATCH 18/79] move fused_weight_gradient_mlp_cuda load inside so that it is not compiled during apex installation --- apex/transformer/tensor_parallel/layers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apex/transformer/tensor_parallel/layers.py b/apex/transformer/tensor_parallel/layers.py index 5588569c7..0644f58f7 100644 --- a/apex/transformer/tensor_parallel/layers.py +++ b/apex/transformer/tensor_parallel/layers.py @@ -54,8 +54,7 @@ _grad_accum_fusion_available = True try: - from apex.op_builder import FusedWeightGradientMlpCudaBuilder - fused_weight_gradient_mlp_cuda = FusedWeightGradientMlpCudaBuilder().load() + from apex.op_builder import FusedWeightGradientMlpCudaBuilder except ImportError: _grad_accum_fusion_available = False @@ -364,6 +363,7 @@ def backward(ctx, grad_output): ) if ctx.gradient_accumulation_fusion: + fused_weight_gradient_mlp_cuda = FusedWeightGradientMlpCudaBuilder().load() if not ctx.use_16bit_in_wgrad_accum_fusion: fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp32( total_input, grad_output, weight.main_grad From ad7439a0209d79abd28c3b110f44760503cf88bc Mon Sep 17 00:00:00 2001 From: skishore Date: Thu, 17 Jul 2025 08:59:14 +0000 Subject: [PATCH 19/79] make fused index mul 2d jit compile and dd aten atomic header flag method to CUDAOpBuilder to support its jit compile --- apex/contrib/index_mul_2d/index_mul_2d.py | 4 +-- op_builder/builder.py | 6 +++++ op_builder/fused_index_mul_2d.py | 33 +++++++++++++++++++++++ 3 files changed, 41 insertions(+), 2 deletions(-) create mode 100644 op_builder/fused_index_mul_2d.py diff --git a/apex/contrib/index_mul_2d/index_mul_2d.py b/apex/contrib/index_mul_2d/index_mul_2d.py index 1d34fe20c..3ecfff888 100644 --- a/apex/contrib/index_mul_2d/index_mul_2d.py +++ b/apex/contrib/index_mul_2d/index_mul_2d.py @@ -1,6 +1,6 @@ import torch - -import fused_index_mul_2d +from apex.op_builder import FusedIndexMul2dBuilder +fused_index_mul_2d = FusedIndexMul2dBuilder().load() class IndexMul2d_(torch.autograd.Function): ''' diff --git 
a/op_builder/builder.py b/op_builder/builder.py index ffcdc8caf..bb9e963a6 100644 --- a/op_builder/builder.py +++ b/op_builder/builder.py @@ -834,6 +834,12 @@ def backward_pass_guard_args(self): backward_pass_guard_args += ['-DBACKWARD_PASS_GUARD'] + ['-DBACKWARD_PASS_GUARD_CLASS=ROCmBackwardPassGuard'] return backward_pass_guard_args + def aten_atomic_args(self): + torch_dir = torch.__path__[0] + if os.path.exists(os.path.join(torch_dir, "include", "ATen", "Atomic.cuh")): + return ['-DATEN_ATOMIC_HEADER'] + else: + return [] class TorchCPUOpBuilder(CUDAOpBuilder): diff --git a/op_builder/fused_index_mul_2d.py b/op_builder/fused_index_mul_2d.py new file mode 100644 index 000000000..2a99bcc51 --- /dev/null +++ b/op_builder/fused_index_mul_2d.py @@ -0,0 +1,33 @@ +from .builder import CUDAOpBuilder + +import sys + + +class FusedIndexMul2dBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_FUSED_INDEX_MUL_2D' + NAME = "fused_index_mul_2d" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ['contrib/csrc/index_mul_2d/index_mul_2d_cuda.cpp', + 'contrib/csrc/index_mul_2d/index_mul_2d_cuda_kernel.cu'] + + def include_paths(self): + return ['contrib/csrc/'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + nvcc_flags = ['-O3'] + self.version_dependent_macros() + if not self.is_rocm_pytorch(): + nvcc_flags += ['--use_fast_math', '--ftz=false'] + else: + nvcc_flags += self.aten_atomic_args() + return nvcc_flags \ No newline at end of file From b2a26fbcd534c0f65fe50ad650ae7281e4cc93ff Mon Sep 17 00:00:00 2001 From: skishore Date: Thu, 17 Jul 2025 12:10:36 +0000 Subject: [PATCH 20/79] make fast multihead attention as jit module, add generator_args to CudaOpBuilder support jit of this module --- .../fast_encdec_multihead_attn_func.py | 4 +- ...ast_encdec_multihead_attn_norm_add_func.py | 3 +- .../fast_self_multihead_attn_func.py | 3 +- .../fast_self_multihead_attn_norm_add_func.py | 3 +- .../mask_softmax_dropout_func.py | 3 +- op_builder/builder.py | 8 +++ op_builder/fast_multihead_attn.py | 49 +++++++++++++++++++ 7 files changed, 67 insertions(+), 6 deletions(-) create mode 100644 op_builder/fast_multihead_attn.py diff --git a/apex/contrib/multihead_attn/fast_encdec_multihead_attn_func.py b/apex/contrib/multihead_attn/fast_encdec_multihead_attn_func.py index 9431a4936..ba6d865ca 100644 --- a/apex/contrib/multihead_attn/fast_encdec_multihead_attn_func.py +++ b/apex/contrib/multihead_attn/fast_encdec_multihead_attn_func.py @@ -1,7 +1,7 @@ import torch -import fast_multihead_attn - +from apex.op_builder import FastMultiheadAttnBuilder +fast_multihead_attn = FastMultiheadAttnBuilder().load() class FastEncdecAttnFunc(torch.autograd.Function): @staticmethod diff --git a/apex/contrib/multihead_attn/fast_encdec_multihead_attn_norm_add_func.py b/apex/contrib/multihead_attn/fast_encdec_multihead_attn_norm_add_func.py index 320bebd66..516e888a9 100644 --- a/apex/contrib/multihead_attn/fast_encdec_multihead_attn_norm_add_func.py +++ b/apex/contrib/multihead_attn/fast_encdec_multihead_attn_norm_add_func.py @@ -7,7 +7,8 @@ import torch -import fast_multihead_attn +from apex.op_builder import FastMultiheadAttnBuilder +fast_multihead_attn = FastMultiheadAttnBuilder().load() class FastEncdecAttnNormAddFunc(torch.autograd.Function): diff --git a/apex/contrib/multihead_attn/fast_self_multihead_attn_func.py 
b/apex/contrib/multihead_attn/fast_self_multihead_attn_func.py index 6b50fe227..9cbe22a58 100644 --- a/apex/contrib/multihead_attn/fast_self_multihead_attn_func.py +++ b/apex/contrib/multihead_attn/fast_self_multihead_attn_func.py @@ -1,6 +1,7 @@ import torch -import fast_multihead_attn +from apex.op_builder import FastMultiheadAttnBuilder +fast_multihead_attn = FastMultiheadAttnBuilder().load() class FastSelfAttnFunc(torch.autograd.Function): diff --git a/apex/contrib/multihead_attn/fast_self_multihead_attn_norm_add_func.py b/apex/contrib/multihead_attn/fast_self_multihead_attn_norm_add_func.py index 7f110cb33..f0a0dc2a8 100644 --- a/apex/contrib/multihead_attn/fast_self_multihead_attn_norm_add_func.py +++ b/apex/contrib/multihead_attn/fast_self_multihead_attn_norm_add_func.py @@ -1,6 +1,7 @@ import torch -import fast_multihead_attn +from apex.op_builder import FastMultiheadAttnBuilder +fast_multihead_attn = FastMultiheadAttnBuilder().load() class FastSelfAttnNormAddFunc(torch.autograd.Function): diff --git a/apex/contrib/multihead_attn/mask_softmax_dropout_func.py b/apex/contrib/multihead_attn/mask_softmax_dropout_func.py index b34eec444..e92703b21 100644 --- a/apex/contrib/multihead_attn/mask_softmax_dropout_func.py +++ b/apex/contrib/multihead_attn/mask_softmax_dropout_func.py @@ -1,6 +1,7 @@ import torch -import fast_multihead_attn +from apex.op_builder import FastMultiheadAttnBuilder +fast_multihead_attn = FastMultiheadAttnBuilder().load() class MaskSoftmaxDropout(torch.autograd.Function): diff --git a/op_builder/builder.py b/op_builder/builder.py index bb9e963a6..6f2242686 100644 --- a/op_builder/builder.py +++ b/op_builder/builder.py @@ -841,6 +841,14 @@ def aten_atomic_args(self): else: return [] + def generator_args(self): + generator_flag = [] + torch_dir = torch.__path__[0] + if os.path.exists(os.path.join(torch_dir, "include", "ATen", "CUDAGeneratorImpl.h")): + generator_flag = ["-DOLD_GENERATOR_PATH"] + return generator_flag + + class TorchCPUOpBuilder(CUDAOpBuilder): def get_cuda_lib64_path(self): diff --git a/op_builder/fast_multihead_attn.py b/op_builder/fast_multihead_attn.py new file mode 100644 index 000000000..737c18d2e --- /dev/null +++ b/op_builder/fast_multihead_attn.py @@ -0,0 +1,49 @@ +from .builder import CUDAOpBuilder + +import sys + + +class FastMultiheadAttnBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_FAST_MULTIHEAD_ATTN' + NAME = "fast_multihead_attn" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ['contrib/csrc/multihead_attn/multihead_attn_frontend.cpp', + 'contrib/csrc/multihead_attn/additive_masked_softmax_dropout_cuda.cu', + "contrib/csrc/multihead_attn/masked_softmax_dropout_cuda.cu", + "contrib/csrc/multihead_attn/encdec_multihead_attn_cuda.cu", + "contrib/csrc/multihead_attn/encdec_multihead_attn_norm_add_cuda.cu", + "contrib/csrc/multihead_attn/self_multihead_attn_cuda.cu", + "contrib/csrc/multihead_attn/self_multihead_attn_bias_additive_mask_cuda.cu", + "contrib/csrc/multihead_attn/self_multihead_attn_bias_cuda.cu", + "contrib/csrc/multihead_attn/self_multihead_attn_norm_add_cuda.cu"] + + def include_paths(self): + return ['csrc/', + 'contrib/csrc/', + 'contrib/csrc/multihead_attn'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + self.generator_args() + + def nvcc_args(self): + nvcc_flags = ['-O3'] + self.version_dependent_macros() + self.generator_args() + if not self.is_rocm_pytorch(): + 
nvcc_flags += ['-U__CUDA_NO_HALF_OPERATORS__', + '-U__CUDA_NO_HALF_CONVERSIONS__', + '--expt-relaxed-constexpr', + '--expt-extended-lambda', + '--use_fast_math'] + self.compute_capability_args() + else: + nvcc_flags += ['-I/opt/rocm/include/hiprand', + '-I/opt/rocm/include/rocrand', + '-U__HIP_NO_HALF_OPERATORS__', + '-U__HIP_NO_HALF_CONVERSIONS__'] + self.backward_pass_guard_args() + return nvcc_flags \ No newline at end of file From 8acc5f5700d0c08e01639679e6e5853427f91b03 Mon Sep 17 00:00:00 2001 From: skishore Date: Thu, 17 Jul 2025 12:35:55 +0000 Subject: [PATCH 21/79] make transducer loss and transducer joint modules as jit modules, add nvcc_threads_args method in CUDAOpBuilder to support these jit modules --- apex/contrib/transducer/transducer.py | 5 +++-- op_builder/builder.py | 15 +++++++++++++ op_builder/transducer_joint.py | 32 +++++++++++++++++++++++++++ op_builder/transducer_loss.py | 30 +++++++++++++++++++++++++ 4 files changed, 80 insertions(+), 2 deletions(-) create mode 100644 op_builder/transducer_joint.py create mode 100644 op_builder/transducer_loss.py diff --git a/apex/contrib/transducer/transducer.py b/apex/contrib/transducer/transducer.py index 784396275..c0a57dea7 100755 --- a/apex/contrib/transducer/transducer.py +++ b/apex/contrib/transducer/transducer.py @@ -1,6 +1,7 @@ import torch -import transducer_loss_cuda -import transducer_joint_cuda +from apex.op_builder import TransducerJointBuilder, TransducerLossBuilder +transducer_loss_cuda = TransducerLossBuilder().load() +transducer_joint_cuda = TransducerJointBuilder().load() class TransducerJoint(torch.nn.Module): """Transducer joint diff --git a/op_builder/builder.py b/op_builder/builder.py index 6f2242686..b7bf3b0cd 100644 --- a/op_builder/builder.py +++ b/op_builder/builder.py @@ -848,6 +848,21 @@ def generator_args(self): generator_flag = ["-DOLD_GENERATOR_PATH"] return generator_flag + def get_cuda_bare_metal_version(cuda_dir): + raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True) + output = raw_output.split() + release_idx = output.index("release") + 1 + release = output[release_idx].split(".") + bare_metal_major = release[0] + bare_metal_minor = release[1][0] + return raw_output, bare_metal_major, bare_metal_minor + + def nvcc_threads_args(self): + cuda_major, cuda_minor = installed_cuda_version(name) + if cuda_major >= 11 and cuda_minor >= 2: + return ["--threads", "4"] + return [] + class TorchCPUOpBuilder(CUDAOpBuilder): diff --git a/op_builder/transducer_joint.py b/op_builder/transducer_joint.py new file mode 100644 index 000000000..00e26c72d --- /dev/null +++ b/op_builder/transducer_joint.py @@ -0,0 +1,32 @@ +from .builder import CUDAOpBuilder +import sys + + +class TransducerJointBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_TRANSDUCER_JOINT' + NAME = "transducer_joint_cuda" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ["contrib/csrc/transducer/transducer_joint.cpp", + "contrib/csrc/transducer/transducer_joint_kernel.cu"] + + def include_paths(self): + return ['contrib/csrc/', + #it uses philox.cuh from contrib/csrc/multihead_attn + 'contrib/csrc/multihead_attn'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + self.generator_args() + + def nvcc_args(self): + nvcc_flags = ['-O3'] + self.version_dependent_macros() + self.generator_args() + if not self.is_rocm_pytorch(): + nvcc_flags += 
self.nvcc_threads_args() + return nvcc_flags \ No newline at end of file diff --git a/op_builder/transducer_loss.py b/op_builder/transducer_loss.py new file mode 100644 index 000000000..cb25a350e --- /dev/null +++ b/op_builder/transducer_loss.py @@ -0,0 +1,30 @@ +from .builder import CUDAOpBuilder +import sys + + +class TransducerLossBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_TRANSDUCER_LOSS' + NAME = "transducer_loss_cuda" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ["contrib/csrc/transducer/transducer_loss.cpp", + "contrib/csrc/transducer/transducer_loss_kernel.cu"] + + def include_paths(self): + return ['contrib/csrc/' ] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + nvcc_flags = ['-O3'] + self.version_dependent_macros() + if not self.is_rocm_pytorch(): + nvcc_flags += self.nvcc_threads_args() + return nvcc_flags \ No newline at end of file From 1718d3a90df409b06b1b8a5148f82617080575a5 Mon Sep 17 00:00:00 2001 From: skishore Date: Thu, 17 Jul 2025 12:47:17 +0000 Subject: [PATCH 22/79] remove extra method - installed_cuda_version from CUDAOpBuilder --- op_builder/builder.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/op_builder/builder.py b/op_builder/builder.py index b7bf3b0cd..bbb5a8ebb 100644 --- a/op_builder/builder.py +++ b/op_builder/builder.py @@ -848,17 +848,8 @@ def generator_args(self): generator_flag = ["-DOLD_GENERATOR_PATH"] return generator_flag - def get_cuda_bare_metal_version(cuda_dir): - raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True) - output = raw_output.split() - release_idx = output.index("release") + 1 - release = output[release_idx].split(".") - bare_metal_major = release[0] - bare_metal_minor = release[1][0] - return raw_output, bare_metal_major, bare_metal_minor - def nvcc_threads_args(self): - cuda_major, cuda_minor = installed_cuda_version(name) + cuda_major, cuda_minor = installed_cuda_version() if cuda_major >= 11 and cuda_minor >= 2: return ["--threads", "4"] return [] From 844c8d47612a934a6b9f4aa8276c82de8a768fc2 Mon Sep 17 00:00:00 2001 From: skishore Date: Thu, 17 Jul 2025 13:23:42 +0000 Subject: [PATCH 23/79] add apex_C module to jit compile, add py-cpuinfo to requirements.txt as it is needed for TorchCPUOpBuilder --- op_builder/apex_C.py | 24 ++++++++++++++++++++++++ requirements.txt | 3 ++- 2 files changed, 26 insertions(+), 1 deletion(-) create mode 100644 op_builder/apex_C.py diff --git a/op_builder/apex_C.py b/op_builder/apex_C.py new file mode 100644 index 000000000..c0f3aa098 --- /dev/null +++ b/op_builder/apex_C.py @@ -0,0 +1,24 @@ +from .builder import TorchCPUOpBuilder + +import sys + + +class ApexCBuilder(TorchCPUOpBuilder): + BUILD_VAR = 'APEX_BUILD_C' + NAME = "apex_C" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ["csrc/flatten_unflatten.cpp"] + + def include_paths(self): + return ['csrc/' ] + + def libraries_args(self): + args = super().libraries_args() + return args \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 241f90a94..53ae8d256 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,4 +5,5 @@ PyYAML>=5.1 pytest>=3.5.1 packaging>=14.0 matplotlib>=3.8 -pandas>=2.2.2 \ No newline at end of file +pandas>=2.2.2 +py-cpuinfo \ No 
newline at end of file From 08939ea45462a425eb96f1981e1c784737d6e043 Mon Sep 17 00:00:00 2001 From: skishore Date: Thu, 17 Jul 2025 13:46:05 +0000 Subject: [PATCH 24/79] make nccl allocator as a jit compile module, add nccl_args method to CUDAOpBuilder to support this --- apex/contrib/nccl_allocator/nccl_allocator.py | 3 ++- op_builder/builder.py | 5 ++++ op_builder/nccl_allocator.py | 27 +++++++++++++++++++ 3 files changed, 34 insertions(+), 1 deletion(-) create mode 100644 op_builder/nccl_allocator.py diff --git a/apex/contrib/nccl_allocator/nccl_allocator.py b/apex/contrib/nccl_allocator/nccl_allocator.py index 62fcee756..0700f907f 100644 --- a/apex/contrib/nccl_allocator/nccl_allocator.py +++ b/apex/contrib/nccl_allocator/nccl_allocator.py @@ -1,6 +1,7 @@ import os import torch -import _apex_nccl_allocator +from apex.op_builder import NCCLAllocatorBuilder +_apex_nccl_allocator = NCCLAllocatorBuilder().load() from contextlib import nullcontext diff --git a/op_builder/builder.py b/op_builder/builder.py index bbb5a8ebb..032640e94 100644 --- a/op_builder/builder.py +++ b/op_builder/builder.py @@ -854,6 +854,11 @@ def nvcc_threads_args(self): return ["--threads", "4"] return [] + def nccl_args(self): + nccl_library = ["-lnccl"] + if self.is_rocm_pytorch(): + nccl_library = ["-lrccl"] + return nccl_library class TorchCPUOpBuilder(CUDAOpBuilder): diff --git a/op_builder/nccl_allocator.py b/op_builder/nccl_allocator.py new file mode 100644 index 000000000..10bffcdb2 --- /dev/null +++ b/op_builder/nccl_allocator.py @@ -0,0 +1,27 @@ +from .builder import CUDAOpBuilder + +import sys + + +class NCCLAllocatorBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_NCCL_ALLOCATOR' + NAME = "_apex_nccl_allocator" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ["contrib/csrc/nccl_allocator/NCCLAllocator.cpp"] + + def include_paths(self): + return ['contrib/csrc/'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + self.generator_args() + + def nvcc_args(self): + return self.nccl_args() \ No newline at end of file From fb451c9fdaef45de744f74147e24ad79b05005c4 Mon Sep 17 00:00:00 2001 From: skishore Date: Thu, 17 Jul 2025 14:52:25 +0000 Subject: [PATCH 25/79] make amp_C as a jit module --- apex/multi_tensor_apply/multi_tensor_apply.py | 3 +- op_builder/amp_C.py | 44 +++++++++++++++++++ tests/L0/run_amp/test_fused_sgd.py | 3 +- tests/L0/run_amp/test_multi_tensor_axpby.py | 3 +- tests/L0/run_amp/test_multi_tensor_l2norm.py | 3 +- tests/L0/run_amp/test_multi_tensor_scale.py | 3 +- tests/L0/run_optimizers/test_lamb.py | 3 +- 7 files changed, 56 insertions(+), 6 deletions(-) create mode 100644 op_builder/amp_C.py diff --git a/apex/multi_tensor_apply/multi_tensor_apply.py b/apex/multi_tensor_apply/multi_tensor_apply.py index 346c6e50f..94ff3fe73 100644 --- a/apex/multi_tensor_apply/multi_tensor_apply.py +++ b/apex/multi_tensor_apply/multi_tensor_apply.py @@ -6,7 +6,8 @@ class MultiTensorApply(object): def __init__(self, chunk_size): try: - import amp_C + from apex.op_builder import AmpCBuilder + amp_C = AmpCBuilder().load() MultiTensorApply.available = True self.chunk_size = chunk_size except ImportError as err: diff --git a/op_builder/amp_C.py b/op_builder/amp_C.py new file mode 100644 index 000000000..563be0dc6 --- /dev/null +++ b/op_builder/amp_C.py @@ -0,0 +1,44 @@ +from .builder import CUDAOpBuilder + +import sys + + +class AmpCBuilder(CUDAOpBuilder): + 
BUILD_VAR = 'APEX_BUILD_AMP_C' + NAME = "amp_C" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ['csrc/amp_C_frontend.cpp', + 'csrc/multi_tensor_sgd_kernel.cu', + 'csrc/multi_tensor_scale_kernel.cu', + 'csrc/multi_tensor_axpby_kernel.cu', + 'csrc/multi_tensor_l2norm_kernel.cu', + 'csrc/multi_tensor_l2norm_kernel_mp.cu', + 'csrc/multi_tensor_l2norm_scale_kernel.cu', + 'csrc/multi_tensor_lamb_stage_1.cu', + 'csrc/multi_tensor_lamb_stage_2.cu', + 'csrc/multi_tensor_adam.cu', + 'csrc/multi_tensor_adagrad.cu', + 'csrc/multi_tensor_novograd.cu', + 'csrc/multi_tensor_lars.cu', + 'csrc/multi_tensor_lamb.cu', + 'csrc/multi_tensor_lamb_mp.cu'] + + def include_paths(self): + return ['csrc/'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + nvcc_flags = ['-O3'] + self.version_dependent_macros() + if not self.is_rocm_pytorch(): + nvcc_flags += ['-lineinfo', '--use_fast_math'] + return nvcc_flags \ No newline at end of file diff --git a/tests/L0/run_amp/test_fused_sgd.py b/tests/L0/run_amp/test_fused_sgd.py index 480cd1132..99d01855a 100644 --- a/tests/L0/run_amp/test_fused_sgd.py +++ b/tests/L0/run_amp/test_fused_sgd.py @@ -14,7 +14,8 @@ ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT try: - import amp_C + from apex.op_builder import AmpCBuilder + amp_C = AmpCBuilder().load() disabled = False from apex.optimizers import FusedSGD as FusedSGD except ImportError as err: diff --git a/tests/L0/run_amp/test_multi_tensor_axpby.py b/tests/L0/run_amp/test_multi_tensor_axpby.py index 4921378a2..bbab05aa6 100644 --- a/tests/L0/run_amp/test_multi_tensor_axpby.py +++ b/tests/L0/run_amp/test_multi_tensor_axpby.py @@ -13,7 +13,8 @@ ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT, common_reset try: - import amp_C + from apex.op_builder import AmpCBuilder + amp_C = AmpCBuilder().load() from amp_C import multi_tensor_axpby from apex.multi_tensor_apply import MultiTensorApply disabled = False diff --git a/tests/L0/run_amp/test_multi_tensor_l2norm.py b/tests/L0/run_amp/test_multi_tensor_l2norm.py index bb28e52d2..d546690de 100644 --- a/tests/L0/run_amp/test_multi_tensor_l2norm.py +++ b/tests/L0/run_amp/test_multi_tensor_l2norm.py @@ -12,7 +12,8 @@ ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT, common_reset try: - import amp_C + from apex.op_builder import AmpCBuilder + amp_C = AmpCBuilder().load() from amp_C import multi_tensor_l2norm from apex.multi_tensor_apply import MultiTensorApply disabled = False diff --git a/tests/L0/run_amp/test_multi_tensor_scale.py b/tests/L0/run_amp/test_multi_tensor_scale.py index f97109c9e..85f60fd0e 100644 --- a/tests/L0/run_amp/test_multi_tensor_scale.py +++ b/tests/L0/run_amp/test_multi_tensor_scale.py @@ -12,7 +12,8 @@ ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT, common_reset try: - import amp_C + from apex.op_builder import AmpCBuilder + amp_C = AmpCBuilder().load() from amp_C import multi_tensor_scale from apex.multi_tensor_apply import MultiTensorApply disabled = False diff --git a/tests/L0/run_optimizers/test_lamb.py b/tests/L0/run_optimizers/test_lamb.py index c6ef9aa95..eb7314600 100644 --- a/tests/L0/run_optimizers/test_lamb.py +++ b/tests/L0/run_optimizers/test_lamb.py @@ -38,7 +38,8 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-6, weight_decay=0 defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) super(RefLAMB, self).__init__(params, defaults) if multi_tensor_applier.available: - import amp_C 
+ from apex.op_builder import AmpCBuilder + amp_C = AmpCBuilder().load() self.multi_tensor_l2norm=amp_C.multi_tensor_l2norm # Skip buffer self._dummy_overflow_buf = torch.tensor([0], dtype=torch.int, device=self.param_groups[0]["params"][0].device) From c6daabdcd92aa96a1ccfb16f1ea81c512bfefcb4 Mon Sep 17 00:00:00 2001 From: skishore Date: Thu, 17 Jul 2025 15:11:05 +0000 Subject: [PATCH 26/79] add a few uses of amp_C jit module --- apex/contrib/optimizers/fp16_optimizer.py | 3 ++- apex/contrib/optimizers/fused_sgd.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/apex/contrib/optimizers/fp16_optimizer.py b/apex/contrib/optimizers/fp16_optimizer.py index 0cbb63b82..2171e181b 100755 --- a/apex/contrib/optimizers/fp16_optimizer.py +++ b/apex/contrib/optimizers/fp16_optimizer.py @@ -54,7 +54,8 @@ def __init__(self, param_group['params'] = fp32_group if multi_tensor_applier.available: - import amp_C + from apex.op_builder import AmpCBuilder + amp_C = AmpCBuilder().load() self.overflow_buf = torch.cuda.IntTensor([0]) self.multi_tensor_l2norm=amp_C.multi_tensor_l2norm else: diff --git a/apex/contrib/optimizers/fused_sgd.py b/apex/contrib/optimizers/fused_sgd.py index 83587c6a6..333e50288 100644 --- a/apex/contrib/optimizers/fused_sgd.py +++ b/apex/contrib/optimizers/fused_sgd.py @@ -83,7 +83,8 @@ def __init__(self, params, lr=required, momentum=0, dampening=0, self.wd_after_momentum = wd_after_momentum if multi_tensor_applier.available: - import amp_C + from apex.op_builder import AmpCBuilder + amp_C = AmpCBuilder().load() # Skip buffer self._dummy_overflow_buf = torch.cuda.IntTensor([0]) self.multi_tensor_sgd = amp_C.multi_tensor_sgd From b2218256c49203a9afd13e61fb1b5de554ff3d97 Mon Sep 17 00:00:00 2001 From: skishore Date: Thu, 17 Jul 2025 15:25:54 +0000 Subject: [PATCH 27/79] add a few uses of amp_C jit module --- apex/optimizers/fused_adagrad.py | 3 ++- apex/optimizers/fused_lars.py | 3 ++- apex/optimizers/fused_mixed_precision_lamb.py | 3 ++- apex/optimizers/fused_novograd.py | 3 ++- apex/optimizers/fused_sgd.py | 3 ++- 5 files changed, 10 insertions(+), 5 deletions(-) diff --git a/apex/optimizers/fused_adagrad.py b/apex/optimizers/fused_adagrad.py index 8d1ef6f32..843841113 100644 --- a/apex/optimizers/fused_adagrad.py +++ b/apex/optimizers/fused_adagrad.py @@ -49,7 +49,8 @@ def __init__(self, params, lr=1e-2, eps=1e-10, self.set_grad_none = set_grad_none if multi_tensor_applier.available: - import amp_C + from apex.op_builder import AmpCBuilder + amp_C = AmpCBuilder().load() # Skip buffer self._dummy_overflow_buf = torch.cuda.IntTensor([0]) self.multi_tensor_adagrad = amp_C.multi_tensor_adagrad diff --git a/apex/optimizers/fused_lars.py b/apex/optimizers/fused_lars.py index 3e60b2cce..744d558ff 100644 --- a/apex/optimizers/fused_lars.py +++ b/apex/optimizers/fused_lars.py @@ -32,7 +32,8 @@ def __init__(self, params, lr=required, momentum=0, dampening=0, self.eps = eps if multi_tensor_applier.available: - import amp_C + from apex.op_builder import AmpCBuilder + amp_C = AmpCBuilder().load() # Skip buffer self._dummy_overflow_buf = torch.tensor([0], dtype=torch.int, device=self.param_groups[0]["params"][0].device) self.multi_tensor_l2norm = amp_C.multi_tensor_l2norm diff --git a/apex/optimizers/fused_mixed_precision_lamb.py b/apex/optimizers/fused_mixed_precision_lamb.py index 7ecda4f51..524100cb8 100644 --- a/apex/optimizers/fused_mixed_precision_lamb.py +++ b/apex/optimizers/fused_mixed_precision_lamb.py @@ -33,7 +33,8 @@ def __init__(self, params, lr=1e-3, step=0, 
bias_correction=True, self.param_groups[idx][item] = group[item].to(device=device) if multi_tensor_applier.available and multi_tensor_applier_l2norm.available: - import amp_C + from apex.op_builder import AmpCBuilder + amp_C = AmpCBuilder().load() self.multi_tensor_l2norm=amp_C.multi_tensor_l2norm_mp # Skip buffer self._dummy_overflow_buf = torch.tensor([0], dtype=torch.int, device=device) diff --git a/apex/optimizers/fused_novograd.py b/apex/optimizers/fused_novograd.py index b3ec5acb9..2f74f627a 100644 --- a/apex/optimizers/fused_novograd.py +++ b/apex/optimizers/fused_novograd.py @@ -77,7 +77,8 @@ def __init__(self, params, lr=1e-3, bias_correction=True, init_zero=init_zero) super(FusedNovoGrad, self).__init__(params, defaults) if multi_tensor_applier.available: - import amp_C + from apex.op_builder import AmpCBuilder + amp_C = AmpCBuilder().load() # Skip buffer # Creating the overflow buffer on the same device as the params tensors. diff --git a/apex/optimizers/fused_sgd.py b/apex/optimizers/fused_sgd.py index 88f26f27a..7c6481bb7 100644 --- a/apex/optimizers/fused_sgd.py +++ b/apex/optimizers/fused_sgd.py @@ -98,7 +98,8 @@ def __init__(self, params, lr=required, momentum=0, dampening=0, self.set_grad_none = set_grad_none if multi_tensor_applier.available: - import amp_C + from apex.op_builder import AmpCBuilder + amp_C = AmpCBuilder().load() # Skip buffer self._dummy_overflow_buf = torch.tensor([0], dtype=torch.int, device=self.param_groups[0]["params"][0].device) self.multi_tensor_sgd = amp_C.multi_tensor_sgd From 8973402b70e95b949274c2a8a5f065ca321d7b25 Mon Sep 17 00:00:00 2001 From: skishore Date: Thu, 17 Jul 2025 15:32:37 +0000 Subject: [PATCH 28/79] make fused adam as a jit module --- apex/contrib/optimizers/fused_adam.py | 3 ++- op_builder/fused_adam.py | 31 +++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) create mode 100644 op_builder/fused_adam.py diff --git a/apex/contrib/optimizers/fused_adam.py b/apex/contrib/optimizers/fused_adam.py index a823e7be6..5c2f73f23 100644 --- a/apex/contrib/optimizers/fused_adam.py +++ b/apex/contrib/optimizers/fused_adam.py @@ -41,7 +41,8 @@ def __init__(self, params, weight_decay=0., max_grad_norm=0., amsgrad=False, use_mt=False, amp_scale_adjustment=1.0): global fused_adam_cuda - fused_adam_cuda = importlib.import_module("fused_adam_cuda") + from apex.op_builder import FusedAdamBuilder + fused_adam_cuda = FusedAdamBuilder().load() self._use_multi_tensor = False if use_mt: diff --git a/op_builder/fused_adam.py b/op_builder/fused_adam.py new file mode 100644 index 000000000..ed4a9369b --- /dev/null +++ b/op_builder/fused_adam.py @@ -0,0 +1,31 @@ +from .builder import CUDAOpBuilder + +import sys + + +class FusedAdamBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_FUSED_ADAM' + NAME = "fused_adam_cuda" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ['apex/contrib/csrc/optimizers/fused_adam_cuda.cpp', + 'apex/contrib/csrc/optimizers/fused_adam_cuda_kernel.cu'] + + def include_paths(self): + return ['contrib/csrc/'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + nvcc_flags = ['-O3'] + self.version_dependent_macros() + if not self.is_rocm_pytorch(): + nvcc_flags += ['--use_fast_math'] + return nvcc_flags \ No newline at end of file From 3b38cb83d2a81c47667d23edc599fb7ca8015512 Mon Sep 17 00:00:00 2001 From: skishore Date: Thu, 17 Jul 2025 
15:49:13 +0000 Subject: [PATCH 29/79] add a few uses of amp_C jit module --- apex/optimizers/fused_adam.py | 3 ++- apex/optimizers/fused_lamb.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/apex/optimizers/fused_adam.py b/apex/optimizers/fused_adam.py index 2ecfc077d..636ba9de6 100644 --- a/apex/optimizers/fused_adam.py +++ b/apex/optimizers/fused_adam.py @@ -107,7 +107,8 @@ def __init__(self, params, lr=1e-3, bias_correction=True, self._step_supports_amp_scaling = True if multi_tensor_applier.available: - import amp_C + from apex.op_builder import AmpCBuilder + amp_C = AmpCBuilder().load() # Skip buffer self._dummy_overflow_buf = torch.tensor([0], dtype=torch.int, device='cuda') self.multi_tensor_adam = amp_C.multi_tensor_adam diff --git a/apex/optimizers/fused_lamb.py b/apex/optimizers/fused_lamb.py index a77e0cd54..d28b0fc7a 100644 --- a/apex/optimizers/fused_lamb.py +++ b/apex/optimizers/fused_lamb.py @@ -73,7 +73,8 @@ def __init__(self, params, lr=1e-3, bias_correction=True, max_grad_norm=max_grad_norm) super(FusedLAMB, self).__init__(params, defaults) if multi_tensor_applier.available and multi_tensor_applier_l2norm.available: - import amp_C + from apex.op_builder import AmpCBuilder + amp_C = AmpCBuilder().load() self.multi_tensor_l2norm=amp_C.multi_tensor_l2norm # Skip buffer self._dummy_overflow_buf = torch.tensor([0], dtype=torch.int, device=self.param_groups[0]["params"][0].device) From 2d29c4caa6fb8c4fa84a1f4bb0627b68b2b057e1 Mon Sep 17 00:00:00 2001 From: skishore Date: Thu, 17 Jul 2025 15:59:51 +0000 Subject: [PATCH 30/79] fix the issue with fused adam jit module --- op_builder/fused_adam.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/op_builder/fused_adam.py b/op_builder/fused_adam.py index ed4a9369b..4a398ae04 100644 --- a/op_builder/fused_adam.py +++ b/op_builder/fused_adam.py @@ -14,11 +14,12 @@ def absolute_name(self): return f'apex.{self.NAME}' def sources(self): - return ['apex/contrib/csrc/optimizers/fused_adam_cuda.cpp', - 'apex/contrib/csrc/optimizers/fused_adam_cuda_kernel.cu'] + return ['contrib/csrc/optimizers/fused_adam_cuda.cpp', + 'contrib/csrc/optimizers/fused_adam_cuda_kernel.cu'] def include_paths(self): - return ['contrib/csrc/'] + return ['contrib/csrc/', + 'csrc'] def cxx_args(self): args = super().cxx_args() From 742c3b3e12b622b4d07a86140ff1566ab680cdaa Mon Sep 17 00:00:00 2001 From: skishore Date: Thu, 17 Jul 2025 16:26:50 +0000 Subject: [PATCH 31/79] make fused lamb as jit module --- apex/contrib/optimizers/fused_lamb.py | 6 +++-- op_builder/fused_lamb.py | 33 +++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 2 deletions(-) create mode 100644 op_builder/fused_lamb.py diff --git a/apex/contrib/optimizers/fused_lamb.py b/apex/contrib/optimizers/fused_lamb.py index 81d868228..3b3e66d7d 100644 --- a/apex/contrib/optimizers/fused_lamb.py +++ b/apex/contrib/optimizers/fused_lamb.py @@ -73,10 +73,12 @@ def __init__(self, params, lr=1e-3, bias_correction=True, max_grad_norm=max_grad_norm) super(FusedLAMB, self).__init__(params, defaults) if multi_tensor_applier.available: - import amp_C + from apex.op_builder import AmpCBuilder + amp_C = AmpCBuilder().load() self.multi_tensor_l2norm=amp_C.multi_tensor_l2norm self._dummy_overflow_buf = torch.cuda.IntTensor([0]) - fused_lamb_cuda = importlib.import_module("fused_lamb_cuda") + from apex.op_builder import FusedLambBuilder + fused_lamb_cuda = FusedLambBuilder().load() self.multi_tensor_lamb = fused_lamb_cuda.lamb else: raise 
RuntimeError('apex.contrib.optimizers.FusedLAMB requires cuda extensions') diff --git a/op_builder/fused_lamb.py b/op_builder/fused_lamb.py new file mode 100644 index 000000000..db14bec13 --- /dev/null +++ b/op_builder/fused_lamb.py @@ -0,0 +1,33 @@ +from .builder import CUDAOpBuilder + +import sys + + +class FusedLambBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_FUSED_LAMB' + NAME = "fused_lamb_cuda" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ['contrib/csrc/optimizers/fused_lamb_cuda.cpp', + 'contrib/csrc/optimizers/fused_lamb_cuda_kernel.cu', + 'csrc/multi_tensor_l2norm_kernel.cu'] + + def include_paths(self): + return ['contrib/csrc/', + 'csrc'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + nvcc_flags = ['-O3'] + self.version_dependent_macros() + if not self.is_rocm_pytorch(): + nvcc_flags += ['--use_fast_math'] + return nvcc_flags \ No newline at end of file From a73b33dc6a2a7ca4f17dfb97478e4df453397664 Mon Sep 17 00:00:00 2001 From: skishore Date: Thu, 17 Jul 2025 18:24:04 +0000 Subject: [PATCH 32/79] make distributed adam as jit module --- .../optimizers/distributed_fused_adam.py | 7 ++-- op_builder/distributed_adam.py | 32 +++++++++++++++++++ 2 files changed, 36 insertions(+), 3 deletions(-) create mode 100644 op_builder/distributed_adam.py diff --git a/apex/contrib/optimizers/distributed_fused_adam.py b/apex/contrib/optimizers/distributed_fused_adam.py index 65da11218..7ff87f791 100644 --- a/apex/contrib/optimizers/distributed_fused_adam.py +++ b/apex/contrib/optimizers/distributed_fused_adam.py @@ -28,8 +28,9 @@ nccl_allocator = None from apex.multi_tensor_apply import multi_tensor_applier -import amp_C -import distributed_adam_cuda +from apex.op_builder import AmpCBuilder, DistributedAdamBuilder, FusedAdamBuilder +amp_C = AmpCBuilder().load() +distributed_adam_cuda = DistributedAdamBuilder().load() # Fallback to private functions if using PyTorch <1.13.0 try: @@ -126,7 +127,7 @@ def _coalescing_manager_append_work( # Import optional CUDA kernels _FOUND_DEPRECATED_FUSED_ADAM: bool = False try: - import fused_adam_cuda + fused_adam_cuda = FusedAdamBuilder().load() _FOUND_DEPRECATED_FUSED_ADAM = True except ImportError: diff --git a/op_builder/distributed_adam.py b/op_builder/distributed_adam.py new file mode 100644 index 000000000..0390ae75e --- /dev/null +++ b/op_builder/distributed_adam.py @@ -0,0 +1,32 @@ +from .builder import CUDAOpBuilder + +import sys + + +class DistributedAdamBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_DISTRIBUTED_ADAM' + NAME = "distributed_adam_cuda" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ['contrib/csrc/optimizers/multi_tensor_distopt_adam.cpp', + 'contrib/csrc/optimizers/multi_tensor_distopt_adam_kernel.cu'] + + def include_paths(self): + return ['contrib/csrc/', + 'csrc'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + nvcc_flags = ['-O3'] + self.version_dependent_macros() + if not self.is_rocm_pytorch(): + nvcc_flags += ['--use_fast_math'] + return nvcc_flags \ No newline at end of file From bc7c56ed9adb835158a2b23d1e53985baadf1ecd Mon Sep 17 00:00:00 2001 From: skishore Date: Thu, 17 Jul 2025 18:34:11 +0000 Subject: [PATCH 33/79] make distributed lamb as jit module --- 
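Note for reviewers: the op converted below follows the same lazy-loading pattern as the other builders in this series. A minimal sketch of how the converted op is obtained at runtime — builder and method names are taken from the diffs in these patches (DistributedLambBuilder, load(), is_compatible()); the compatibility guard is illustrative, and the exact build/cache behaviour is whatever op_builder/builder.py implements:

    from apex.op_builder import DistributedLambBuilder

    builder = DistributedLambBuilder()              # NAME = "distributed_lamb_cuda"
    if builder.is_compatible():                     # optional guard; skips ops the toolchain cannot build
        distributed_lamb_cuda = builder.load()      # JIT-compiles on first call, then reuses the cached extension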
.../optimizers/distributed_fused_lamb.py | 9 +++--- op_builder/distributed_lamb.py | 32 +++++++++++++++++++ 2 files changed, 37 insertions(+), 4 deletions(-) create mode 100644 op_builder/distributed_lamb.py diff --git a/apex/contrib/optimizers/distributed_fused_lamb.py b/apex/contrib/optimizers/distributed_fused_lamb.py index 0925bd04a..c1a9ee12d 100644 --- a/apex/contrib/optimizers/distributed_fused_lamb.py +++ b/apex/contrib/optimizers/distributed_fused_lamb.py @@ -3,7 +3,8 @@ import inspect import torch import importlib -import amp_C +from apex.op_builder import AmpCBuilder, DistributedLambBuilder, FusedAdamBuilder +amp_C = AmpCBuilder().load() from apex.multi_tensor_apply import multi_tensor_applier import torch.distributed.distributed_c10d as c10d @@ -113,14 +114,14 @@ def __init__(self, params, super(DistributedFusedLAMB, self).__init__(params, defaults) global fused_adam_cuda, distributed_lamb_cuda - fused_adam_cuda = importlib.import_module("fused_adam_cuda") - distributed_lamb_cuda = importlib.import_module("distributed_lamb_cuda") + fused_adam_cuda = FusedAdamBuilder().load() + distributed_lamb_cuda = DistributedLambBuilder().load() self._overflow_buf = torch.cuda.IntTensor([0]) self._has_overflow = False self.multi_tensor_lamb_compute_update_term = distributed_lamb_cuda.multi_tensor_lamb_compute_update_term self.multi_tensor_lamb_update_weights = distributed_lamb_cuda.multi_tensor_lamb_update_weights - import amp_C + amp_C = AmpCBuilder().load() self.multi_tensor_l2norm = amp_C.multi_tensor_l2norm self._grad_averaging = grad_averaging diff --git a/op_builder/distributed_lamb.py b/op_builder/distributed_lamb.py new file mode 100644 index 000000000..e314dbc40 --- /dev/null +++ b/op_builder/distributed_lamb.py @@ -0,0 +1,32 @@ +from .builder import CUDAOpBuilder + +import sys + + +class DistributedLambBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_DISTRIBUTEDLAMB' + NAME = "distributed_lamb_cuda" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ['contrib/csrc/optimizers/multi_tensor_distopt_lamb.cpp', + 'contrib/csrc/optimizers/multi_tensor_distopt_lamb_kernel.cu'] + + def include_paths(self): + return ['contrib/csrc/', + 'csrc'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + nvcc_flags = ['-O3'] + self.version_dependent_macros() + if not self.is_rocm_pytorch(): + nvcc_flags += ['--use_fast_math'] + return nvcc_flags \ No newline at end of file From ac684d5e68330ec0fbe448f00b7f90ab3fdef31d Mon Sep 17 00:00:00 2001 From: skishore Date: Fri, 18 Jul 2025 10:43:00 +0000 Subject: [PATCH 34/79] add remaining amp_C uses with jit loader --- apex/amp/_process_optimizer.py | 3 ++- apex/amp/scaler.py | 3 ++- apex/contrib/clip_grad/clip_grad.py | 3 ++- apex/fp16_utils/fp16_optimizer.py | 3 ++- apex/parallel/distributed.py | 3 ++- apex/transformer/pipeline_parallel/utils.py | 3 ++- 6 files changed, 12 insertions(+), 6 deletions(-) diff --git a/apex/amp/_process_optimizer.py b/apex/amp/_process_optimizer.py index 66c4c3fdf..df621b87c 100644 --- a/apex/amp/_process_optimizer.py +++ b/apex/amp/_process_optimizer.py @@ -338,7 +338,8 @@ def _process_optimizer(optimizer, properties): # TODO: Centralize exposure and import error checking for the C backend. 
if multi_tensor_applier.available: - import amp_C + from apex.op_builder import AmpCBuilder + amp_C = AmpCBuilder().load() optimizer._amp_stash.multi_tensor_scale = amp_C.multi_tensor_scale optimizer._amp_stash.multi_tensor_l2norm = amp_C.multi_tensor_l2norm optimizer._amp_stash.dummy_overflow_buf = torch.tensor([0], dtype=torch.int, device='cuda') diff --git a/apex/amp/scaler.py b/apex/amp/scaler.py index c11f70398..75e0afbb4 100644 --- a/apex/amp/scaler.py +++ b/apex/amp/scaler.py @@ -64,7 +64,8 @@ def __init__(self, self._has_overflow = False self._overflow_buf = torch.tensor([0], dtype=torch.int, device='cuda') if multi_tensor_applier.available: - import amp_C + from apex.op_builder import AmpCBuilder + amp_C = AmpCBuilder().load() LossScaler.has_fused_kernel = multi_tensor_applier.available LossScaler.multi_tensor_scale_cuda = amp_C.multi_tensor_scale LossScaler.multi_tensor_axpby_cuda = amp_C.multi_tensor_axpby diff --git a/apex/contrib/clip_grad/clip_grad.py b/apex/contrib/clip_grad/clip_grad.py index b6411352b..666df350e 100644 --- a/apex/contrib/clip_grad/clip_grad.py +++ b/apex/contrib/clip_grad/clip_grad.py @@ -4,7 +4,8 @@ _kernel_import_succeeded = False try: - import amp_C + from apex.op_builder import AmpCBuilder + amp_C = AmpCBuilder().load() from apex.multi_tensor_apply import multi_tensor_applier _kernel_import_succeeded = True except ImportError: diff --git a/apex/fp16_utils/fp16_optimizer.py b/apex/fp16_utils/fp16_optimizer.py index 7c0dd397f..feb3e3ed6 100755 --- a/apex/fp16_utils/fp16_optimizer.py +++ b/apex/fp16_utils/fp16_optimizer.py @@ -101,7 +101,8 @@ def __init__(self, # TODO: Centralize exposure and import error checking for the C backend. if multi_tensor_applier.available: - import amp_C + from apex.op_builder import AmpCBuilder + amp_C = AmpCBuilder().load() self.multi_tensor_scale = amp_C.multi_tensor_scale self._dummy_overflow_buf = torch.cuda.IntTensor([0]); diff --git a/apex/parallel/distributed.py b/apex/parallel/distributed.py index 6aa6a6e8a..a110e38e3 100644 --- a/apex/parallel/distributed.py +++ b/apex/parallel/distributed.py @@ -245,7 +245,8 @@ def __init__(self, if multi_tensor_applier.available: # TODO: I really need to centralize the C++ backed imports - import amp_C + from apex.op_builder import AmpCBuilder + amp_C = AmpCBuilder().load() self.multi_tensor_scale = amp_C.multi_tensor_scale self._overflow_buf = torch.cuda.IntTensor([0]) diff --git a/apex/transformer/pipeline_parallel/utils.py b/apex/transformer/pipeline_parallel/utils.py index ae550d0b9..268bcdca8 100644 --- a/apex/transformer/pipeline_parallel/utils.py +++ b/apex/transformer/pipeline_parallel/utils.py @@ -25,7 +25,8 @@ from apex.transformer.microbatches import build_num_microbatches_calculator from apex.transformer.pipeline_parallel._timers import _Timers if multi_tensor_applier.available: - import amp_C + from apex.op_builder import AmpCBuilder + amp_C = AmpCBuilder().load() _GLOBAL_ARGS = None From 18ba69652c4add64cc4bd8a45d6dd0e6f583ba82 Mon Sep 17 00:00:00 2001 From: skishore Date: Tue, 22 Jul 2025 07:42:58 +0000 Subject: [PATCH 35/79] add remaining usage of apexC jit module --- apex/parallel/distributed.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/apex/parallel/distributed.py b/apex/parallel/distributed.py index a110e38e3..a4faadd89 100644 --- a/apex/parallel/distributed.py +++ b/apex/parallel/distributed.py @@ -13,7 +13,8 @@ def import_flatten_impl(): global flatten_impl, unflatten_impl, imported_flatten_impl try: - import apex_C + from 
apex.op_builder import ApexCBuilder + apex_C = ApexCBuilder().load() flatten_impl = apex_C.flatten unflatten_impl = apex_C.unflatten except ImportError: From e79a0286b9dbc905053456761572d699e6240f29 Mon Sep 17 00:00:00 2001 From: skishore Date: Tue, 22 Jul 2025 07:43:46 +0000 Subject: [PATCH 36/79] make nccl p2p module as jit compile --- op_builder/nccl_p2p.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 op_builder/nccl_p2p.py diff --git a/op_builder/nccl_p2p.py b/op_builder/nccl_p2p.py new file mode 100644 index 000000000..3e2d55e2b --- /dev/null +++ b/op_builder/nccl_p2p.py @@ -0,0 +1,25 @@ +from .builder import CUDAOpBuilder + +import sys + + +class NCCLP2PBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_NCCL_P2P' + NAME = "nccl_p2p_cuda" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ["contrib/csrc/nccl_p2p/nccl_p2p_cuda.cu", + "contrib/csrc/nccl_p2p/nccl_p2p.cpp"] + + def include_paths(self): + return ['contrib/csrc/'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + self.generator_args() \ No newline at end of file From 3bd3045c865842d38cd5555941b18a32e36be271 Mon Sep 17 00:00:00 2001 From: skishore Date: Tue, 22 Jul 2025 07:44:47 +0000 Subject: [PATCH 37/79] make peer memory module as jit compile --- .../peer_halo_exchange_module_tests.py | 3 ++- .../peer_memory/peer_halo_exchanger_1d.py | 3 ++- apex/contrib/peer_memory/peer_memory.py | 3 ++- op_builder/peer_memory.py | 25 +++++++++++++++++++ 4 files changed, 31 insertions(+), 3 deletions(-) create mode 100644 op_builder/peer_memory.py diff --git a/apex/contrib/peer_memory/peer_halo_exchange_module_tests.py b/apex/contrib/peer_memory/peer_halo_exchange_module_tests.py index bd85354af..135482186 100644 --- a/apex/contrib/peer_memory/peer_halo_exchange_module_tests.py +++ b/apex/contrib/peer_memory/peer_halo_exchange_module_tests.py @@ -1,6 +1,7 @@ import torch from apex.contrib.peer_memory import PeerMemoryPool, PeerHaloExchanger1d -import peer_memory_cuda as pm +from apex.op_builder import PeerMemoryBuilder +pm = PeerMemoryBuilder().load() # How to run: # torchrun --nproc_per_node diff --git a/apex/contrib/peer_memory/peer_halo_exchanger_1d.py b/apex/contrib/peer_memory/peer_halo_exchanger_1d.py index cc25693ce..773ca6f4b 100644 --- a/apex/contrib/peer_memory/peer_halo_exchanger_1d.py +++ b/apex/contrib/peer_memory/peer_halo_exchanger_1d.py @@ -1,6 +1,7 @@ import torch from apex.contrib.peer_memory import PeerMemoryPool -import peer_memory_cuda as pm +from apex.op_builder import PeerMemoryBuilder +pm = PeerMemoryBuilder().load() class PeerHaloExchanger1d: def __init__(self, ranks, rank_in_group, peer_pool, half_halo): diff --git a/apex/contrib/peer_memory/peer_memory.py b/apex/contrib/peer_memory/peer_memory.py index adb218219..5f9e07773 100644 --- a/apex/contrib/peer_memory/peer_memory.py +++ b/apex/contrib/peer_memory/peer_memory.py @@ -1,6 +1,7 @@ import torch import numpy as np -import peer_memory_cuda as pm +from apex.op_builder import PeerMemoryBuilder +pm = PeerMemoryBuilder().load() class PeerMemoryPool(object): diff --git a/op_builder/peer_memory.py b/op_builder/peer_memory.py new file mode 100644 index 000000000..64a1616c2 --- /dev/null +++ b/op_builder/peer_memory.py @@ -0,0 +1,25 @@ +from .builder import CUDAOpBuilder + +import sys + + +class PeerMemoryBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_PEER_MEMORY' + NAME = 
"peer_memory_cuda" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ["contrib/csrc/peer_memory/peer_memory_cuda.cu", + "contrib/csrc/peer_memory/peer_memory.cpp"] + + def include_paths(self): + return ['contrib/csrc/'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + self.generator_args() \ No newline at end of file From 8ba059eaf5e9d37bb552a038506079fb321a7115 Mon Sep 17 00:00:00 2001 From: skishore Date: Tue, 22 Jul 2025 07:54:12 +0000 Subject: [PATCH 38/79] add code to check for minimum nccl version to compile nccl allocator module --- op_builder/builder.py | 8 ++++++++ op_builder/nccl_allocator.py | 9 ++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/op_builder/builder.py b/op_builder/builder.py index 032640e94..b35ab4bea 100644 --- a/op_builder/builder.py +++ b/op_builder/builder.py @@ -860,6 +860,14 @@ def nccl_args(self): nccl_library = ["-lrccl"] return nccl_library + def nccl_version(self): + from torch.utils.cpp_extension import load + _nccl_version_getter = load( + name="_nccl_version_getter", + sources=["contrib/csrc/nccl_p2p/nccl_version.cpp", "contrib/csrc/nccl_p2p/nccl_version_check.cu"], + ) + return _nccl_version_getter.get_nccl_version() + class TorchCPUOpBuilder(CUDAOpBuilder): def get_cuda_lib64_path(self): diff --git a/op_builder/nccl_allocator.py b/op_builder/nccl_allocator.py index 10bffcdb2..326a3618c 100644 --- a/op_builder/nccl_allocator.py +++ b/op_builder/nccl_allocator.py @@ -24,4 +24,11 @@ def cxx_args(self): return args + self.version_dependent_macros() + self.generator_args() def nvcc_args(self): - return self.nccl_args() \ No newline at end of file + return self.nccl_args() + + def is_compatible(self): + available_nccl_version = self.nccl_version() + if available_nccl_version >= (2, 19): + return True + else: + return False \ No newline at end of file From 4f417e622029e01bc6fce22f424752d66d8a7194 Mon Sep 17 00:00:00 2001 From: skishore Date: Tue, 22 Jul 2025 11:42:35 +0000 Subject: [PATCH 39/79] add provision to provide APEX_CPP_OPS=1 and APEX_CUDA_OPS=1 as replacement for --cpp_ext --cuda_ext command line arguments for building specific extensions in apex, save these settings for later use --- op_builder/amp_C.py | 1 + op_builder/apex_C.py | 1 + op_builder/bnp.py | 1 + op_builder/distributed_adam.py | 1 + op_builder/distributed_lamb.py | 3 +- op_builder/fast_multihead_attn.py | 1 + op_builder/focal_loss.py | 1 + op_builder/fused_adam.py | 1 + op_builder/fused_bias_swiglu.py | 3 +- op_builder/fused_dense.py | 1 + op_builder/fused_index_mul_2d.py | 1 + op_builder/fused_lamb.py | 1 + op_builder/fused_layer_norm.py | 1 + op_builder/fused_rope.py | 3 +- op_builder/fused_weight_gradient_mlp.py | 4 +- .../generic_scaled_masked_softmax_cuda.py | 4 +- op_builder/mlp.py | 1 + op_builder/nccl_allocator.py | 1 + op_builder/nccl_p2p.py | 1 + op_builder/peer_memory.py | 1 + op_builder/scaled_masked_softmax_cuda.py | 4 +- op_builder/scaled_softmax_cuda.py | 4 +- ...scaled_upper_triang_masked_softmax_cuda.py | 3 +- op_builder/syncbn.py | 1 + op_builder/transducer_joint.py | 1 + op_builder/transducer_loss.py | 1 + op_builder/xentropy.py | 1 + setup.py | 53 ++++++++++--------- 28 files changed, 64 insertions(+), 36 deletions(-) diff --git a/op_builder/amp_C.py b/op_builder/amp_C.py index 563be0dc6..284d2a808 100644 --- a/op_builder/amp_C.py +++ b/op_builder/amp_C.py @@ -5,6 +5,7 @@ class 
AmpCBuilder(CUDAOpBuilder): BUILD_VAR = 'APEX_BUILD_AMP_C' + INCLUDE_FLAG = "APEX_CUDA_OPS" NAME = "amp_C" def __init__(self): diff --git a/op_builder/apex_C.py b/op_builder/apex_C.py index c0f3aa098..2d458602e 100644 --- a/op_builder/apex_C.py +++ b/op_builder/apex_C.py @@ -5,6 +5,7 @@ class ApexCBuilder(TorchCPUOpBuilder): BUILD_VAR = 'APEX_BUILD_C' + INCLUDE_FLAG = "APEX_CPP_OPS" NAME = "apex_C" def __init__(self): diff --git a/op_builder/bnp.py b/op_builder/bnp.py index 47fe1fdb3..e15db8aa3 100644 --- a/op_builder/bnp.py +++ b/op_builder/bnp.py @@ -5,6 +5,7 @@ class BnpBuilder(CUDAOpBuilder): BUILD_VAR = 'APEX_BUILD_BNP' + INCLUDE_FLAG = "APEX_CUDA_OPS" NAME = "bnp" def __init__(self): diff --git a/op_builder/distributed_adam.py b/op_builder/distributed_adam.py index 0390ae75e..2d142b397 100644 --- a/op_builder/distributed_adam.py +++ b/op_builder/distributed_adam.py @@ -5,6 +5,7 @@ class DistributedAdamBuilder(CUDAOpBuilder): BUILD_VAR = 'APEX_BUILD_DISTRIBUTED_ADAM' + INCLUDE_FLAG = "APEX_CUDA_OPS" NAME = "distributed_adam_cuda" def __init__(self): diff --git a/op_builder/distributed_lamb.py b/op_builder/distributed_lamb.py index e314dbc40..53b8a35ee 100644 --- a/op_builder/distributed_lamb.py +++ b/op_builder/distributed_lamb.py @@ -4,7 +4,8 @@ class DistributedLambBuilder(CUDAOpBuilder): - BUILD_VAR = 'APEX_BUILD_DISTRIBUTEDLAMB' + BUILD_VAR = 'APEX_BUILD_DISTRIBUTED_LAMB' + INCLUDE_FLAG = "APEX_CUDA_OPS" NAME = "distributed_lamb_cuda" def __init__(self): diff --git a/op_builder/fast_multihead_attn.py b/op_builder/fast_multihead_attn.py index 737c18d2e..37a0eb09b 100644 --- a/op_builder/fast_multihead_attn.py +++ b/op_builder/fast_multihead_attn.py @@ -5,6 +5,7 @@ class FastMultiheadAttnBuilder(CUDAOpBuilder): BUILD_VAR = 'APEX_BUILD_FAST_MULTIHEAD_ATTN' + INCLUDE_FLAG = "APEX_CUDA_OPS" NAME = "fast_multihead_attn" def __init__(self): diff --git a/op_builder/focal_loss.py b/op_builder/focal_loss.py index af0fbf31b..14d0f89f1 100644 --- a/op_builder/focal_loss.py +++ b/op_builder/focal_loss.py @@ -5,6 +5,7 @@ class FocalLossBuilder(CUDAOpBuilder): BUILD_VAR = 'APEX_BUILD_FOCAL_LOSS' + INCLUDE_FLAG = "APEX_CUDA_OPS" NAME = "focal_loss_cuda" def __init__(self): diff --git a/op_builder/fused_adam.py b/op_builder/fused_adam.py index 4a398ae04..c352cf026 100644 --- a/op_builder/fused_adam.py +++ b/op_builder/fused_adam.py @@ -5,6 +5,7 @@ class FusedAdamBuilder(CUDAOpBuilder): BUILD_VAR = 'APEX_BUILD_FUSED_ADAM' + INCLUDE_FLAG = "APEX_CUDA_OPS" NAME = "fused_adam_cuda" def __init__(self): diff --git a/op_builder/fused_bias_swiglu.py b/op_builder/fused_bias_swiglu.py index 4b00b8212..77c1c58c6 100644 --- a/op_builder/fused_bias_swiglu.py +++ b/op_builder/fused_bias_swiglu.py @@ -4,6 +4,7 @@ class FusedBiasSwiGLUBuilder(CUDAOpBuilder): BUILD_VAR = 'APEX_BUILD_FUSED_BIAS_SWIGLU' + INCLUDE_FLAG = "APEX_CUDA_OPS" NAME = "fused_bias_swiglu" def __init__(self): @@ -53,4 +54,4 @@ def nvcc_args(self): print(f"Warning: Error processing PYTORCH_ROCM_ARCH: {e}") print("Falling back to default architecture gfx906") nvcc_flags += ['--offload-arch=gfx906'] - return nvcc_flags + return nvcc_flags \ No newline at end of file diff --git a/op_builder/fused_dense.py b/op_builder/fused_dense.py index bef56ee4b..3df1dfcac 100644 --- a/op_builder/fused_dense.py +++ b/op_builder/fused_dense.py @@ -5,6 +5,7 @@ class FusedDenseBuilder(CUDAOpBuilder): BUILD_VAR = 'APEX_BUILD_FUSED_DENSE' + INCLUDE_FLAG = "APEX_CUDA_OPS" NAME = "fused_dense_cuda" def __init__(self): diff --git a/op_builder/fused_index_mul_2d.py 
b/op_builder/fused_index_mul_2d.py index 2a99bcc51..081061be4 100644 --- a/op_builder/fused_index_mul_2d.py +++ b/op_builder/fused_index_mul_2d.py @@ -5,6 +5,7 @@ class FusedIndexMul2dBuilder(CUDAOpBuilder): BUILD_VAR = 'APEX_BUILD_FUSED_INDEX_MUL_2D' + INCLUDE_FLAG = "APEX_CUDA_OPS" NAME = "fused_index_mul_2d" def __init__(self): diff --git a/op_builder/fused_lamb.py b/op_builder/fused_lamb.py index db14bec13..3f4dc027f 100644 --- a/op_builder/fused_lamb.py +++ b/op_builder/fused_lamb.py @@ -5,6 +5,7 @@ class FusedLambBuilder(CUDAOpBuilder): BUILD_VAR = 'APEX_BUILD_FUSED_LAMB' + INCLUDE_FLAG = "APEX_CUDA_OPS" NAME = "fused_lamb_cuda" def __init__(self): diff --git a/op_builder/fused_layer_norm.py b/op_builder/fused_layer_norm.py index 4e0ca9c8a..f56c930aa 100644 --- a/op_builder/fused_layer_norm.py +++ b/op_builder/fused_layer_norm.py @@ -5,6 +5,7 @@ class FusedLayerNormBuilder(CUDAOpBuilder): BUILD_VAR = 'APEX_BUILD_FUSED_LAYER_NORM' + INCLUDE_FLAG = "APEX_CUDA_OPS" NAME = "fused_layer_norm_cuda" def __init__(self): diff --git a/op_builder/fused_rope.py b/op_builder/fused_rope.py index 74c580b02..4bfe3b632 100644 --- a/op_builder/fused_rope.py +++ b/op_builder/fused_rope.py @@ -5,6 +5,7 @@ class FusedRopeBuilder(CUDAOpBuilder): BUILD_VAR = 'APEX_BUILD_FUSED_ROPE' + INCLUDE_FLAG = "APEX_CUDA_OPS" NAME = "fused_rotary_positional_embedding" def __init__(self): @@ -36,4 +37,4 @@ def nvcc_args(self): '--expt-relaxed-constexpr', '--expt-extended-lambda' ]) - return nvcc_flags + return nvcc_flags \ No newline at end of file diff --git a/op_builder/fused_weight_gradient_mlp.py b/op_builder/fused_weight_gradient_mlp.py index 040c465a0..510a33046 100644 --- a/op_builder/fused_weight_gradient_mlp.py +++ b/op_builder/fused_weight_gradient_mlp.py @@ -2,6 +2,7 @@ class FusedWeightGradientMlpCudaBuilder(CUDAOpBuilder): BUILD_VAR = 'APEX_BUILD_FUSED_WEIGHT_GRADIENT_MLP' + INCLUDE_FLAG = "APEX_CUDA_OPS" NAME = "fused_weight_gradient_mlp_cuda" def __init__(self): @@ -38,5 +39,4 @@ def nvcc_args(self): '--expt-extended-lambda', "--use_fast_math" ]) + self.compute_capability_args() - return nvcc_flags - + return nvcc_flags \ No newline at end of file diff --git a/op_builder/generic_scaled_masked_softmax_cuda.py b/op_builder/generic_scaled_masked_softmax_cuda.py index 1cf963948..c263c2d46 100644 --- a/op_builder/generic_scaled_masked_softmax_cuda.py +++ b/op_builder/generic_scaled_masked_softmax_cuda.py @@ -2,6 +2,7 @@ class GenericScaledMaskedSoftmaxCudaBuilder(CUDAOpBuilder): BUILD_VAR = 'APEX_BUILD_GENERIC_SCALED_MASKED_SOFTMAX_CUDA' + INCLUDE_FLAG = "APEX_CUDA_OPS" NAME = "generic_scaled_masked_softmax_cuda" def __init__(self): @@ -35,5 +36,4 @@ def nvcc_args(self): '--expt-relaxed-constexpr', '--expt-extended-lambda' ]) - return nvcc_flags - + return nvcc_flags \ No newline at end of file diff --git a/op_builder/mlp.py b/op_builder/mlp.py index dd55679e3..fb42e7d3d 100644 --- a/op_builder/mlp.py +++ b/op_builder/mlp.py @@ -5,6 +5,7 @@ class MlpBuilder(CUDAOpBuilder): BUILD_VAR = 'APEX_BUILD_MLP' + INCLUDE_FLAG = "APEX_CUDA_OPS" NAME = "mlp_cuda" def __init__(self): diff --git a/op_builder/nccl_allocator.py b/op_builder/nccl_allocator.py index 326a3618c..e687d3f82 100644 --- a/op_builder/nccl_allocator.py +++ b/op_builder/nccl_allocator.py @@ -5,6 +5,7 @@ class NCCLAllocatorBuilder(CUDAOpBuilder): BUILD_VAR = 'APEX_BUILD_NCCL_ALLOCATOR' + INCLUDE_FLAG = "APEX_CUDA_OPS" NAME = "_apex_nccl_allocator" def __init__(self): diff --git a/op_builder/nccl_p2p.py b/op_builder/nccl_p2p.py index 3e2d55e2b..ee9ed2bb5 
100644 --- a/op_builder/nccl_p2p.py +++ b/op_builder/nccl_p2p.py @@ -5,6 +5,7 @@ class NCCLP2PBuilder(CUDAOpBuilder): BUILD_VAR = 'APEX_BUILD_NCCL_P2P' + INCLUDE_FLAG = "APEX_CUDA_OPS" NAME = "nccl_p2p_cuda" def __init__(self): diff --git a/op_builder/peer_memory.py b/op_builder/peer_memory.py index 64a1616c2..07e712339 100644 --- a/op_builder/peer_memory.py +++ b/op_builder/peer_memory.py @@ -5,6 +5,7 @@ class PeerMemoryBuilder(CUDAOpBuilder): BUILD_VAR = 'APEX_BUILD_PEER_MEMORY' + INCLUDE_FLAG = "APEX_CUDA_OPS" NAME = "peer_memory_cuda" def __init__(self): diff --git a/op_builder/scaled_masked_softmax_cuda.py b/op_builder/scaled_masked_softmax_cuda.py index ee482ecda..0e5e0bbcf 100644 --- a/op_builder/scaled_masked_softmax_cuda.py +++ b/op_builder/scaled_masked_softmax_cuda.py @@ -2,6 +2,7 @@ class ScaledMaskedSoftmaxCudaBuilder(CUDAOpBuilder): BUILD_VAR = 'APEX_BUILD_SCALED_MASKED_SOFTMAX_CUDA' + INCLUDE_FLAG = "APEX_CUDA_OPS" NAME = "scaled_masked_softmax_cuda" def __init__(self): @@ -36,5 +37,4 @@ def nvcc_args(self): '--expt-relaxed-constexpr', '--expt-extended-lambda' ]) - return nvcc_flags - + return nvcc_flags \ No newline at end of file diff --git a/op_builder/scaled_softmax_cuda.py b/op_builder/scaled_softmax_cuda.py index 21c75eb32..89eb4fb08 100644 --- a/op_builder/scaled_softmax_cuda.py +++ b/op_builder/scaled_softmax_cuda.py @@ -4,6 +4,7 @@ class ScaledSoftmaxCudaBuilder(CUDAOpBuilder): BUILD_VAR = 'APEX_BUILD_SCALED_SOFTMAX_CUDA' + INCLUDE_FLAG = "APEX_CUDA_OPS" NAME = "scaled_softmax_cuda" def __init__(self): @@ -37,5 +38,4 @@ def nvcc_args(self): '--expt-relaxed-constexpr', '--expt-extended-lambda' ]) - return nvcc_flags - + return nvcc_flags \ No newline at end of file diff --git a/op_builder/scaled_upper_triang_masked_softmax_cuda.py b/op_builder/scaled_upper_triang_masked_softmax_cuda.py index 045cd75df..4b9d6797c 100644 --- a/op_builder/scaled_upper_triang_masked_softmax_cuda.py +++ b/op_builder/scaled_upper_triang_masked_softmax_cuda.py @@ -2,6 +2,7 @@ class ScaledUpperTriangMaskedSoftmaxCudaBuilder(CUDAOpBuilder): BUILD_VAR = 'APEX_BUILD_SCALED_UPPER_TRIANG_MASKED_SOFTMAX_CUDA' + INCLUDE_FLAG = "APEX_CUDA_OPS" NAME = "scaled_upper_triang_masked_softmax_cuda" def __init__(self): @@ -35,4 +36,4 @@ def nvcc_args(self): '--expt-relaxed-constexpr', '--expt-extended-lambda' ]) - return nvcc_flags + return nvcc_flags \ No newline at end of file diff --git a/op_builder/syncbn.py b/op_builder/syncbn.py index 1e640aab2..2dca996a8 100644 --- a/op_builder/syncbn.py +++ b/op_builder/syncbn.py @@ -5,6 +5,7 @@ class SyncBnBuilder(CUDAOpBuilder): BUILD_VAR = 'APEX_BUILD_SYNCBN' + INCLUDE_FLAG = "APEX_CUDA_OPS" NAME = "syncbn" def __init__(self): diff --git a/op_builder/transducer_joint.py b/op_builder/transducer_joint.py index 00e26c72d..248f5c985 100644 --- a/op_builder/transducer_joint.py +++ b/op_builder/transducer_joint.py @@ -4,6 +4,7 @@ class TransducerJointBuilder(CUDAOpBuilder): BUILD_VAR = 'APEX_BUILD_TRANSDUCER_JOINT' + INCLUDE_FLAG = "APEX_CUDA_OPS" NAME = "transducer_joint_cuda" def __init__(self): diff --git a/op_builder/transducer_loss.py b/op_builder/transducer_loss.py index cb25a350e..da3e5c461 100644 --- a/op_builder/transducer_loss.py +++ b/op_builder/transducer_loss.py @@ -4,6 +4,7 @@ class TransducerLossBuilder(CUDAOpBuilder): BUILD_VAR = 'APEX_BUILD_TRANSDUCER_LOSS' + INCLUDE_FLAG = "APEX_CUDA_OPS" NAME = "transducer_loss_cuda" def __init__(self): diff --git a/op_builder/xentropy.py b/op_builder/xentropy.py index fbe6f702c..107b2412c 100644 --- 
a/op_builder/xentropy.py +++ b/op_builder/xentropy.py @@ -5,6 +5,7 @@ class XentropyBuilder(CUDAOpBuilder): BUILD_VAR = 'APEX_BUILD_XENTROPY' + INCLUDE_FLAG = "APEX_CUDA_OPS" NAME = "xentropy_cuda" def __init__(self): diff --git a/setup.py b/setup.py index 55c1a39b9..caa970f25 100644 --- a/setup.py +++ b/setup.py @@ -217,30 +217,11 @@ def check_if_rocm_pytorch(): extras = {} -# Set up macros for forward/backward compatibility hack around -# https://github.com/pytorch/pytorch/commit/4404762d7dd955383acee92e6f06b48144a0742e -# and -# https://github.com/NVIDIA/apex/issues/456 -# https://github.com/pytorch/pytorch/commit/eb7b39e02f7d75c26d8a795ea8c7fd911334da7e#diff-4632522f237f1e4e728cb824300403ac -version_ge_1_1 = [] -if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 0): - version_ge_1_1 = ["-DVERSION_GE_1_1"] -version_ge_1_3 = [] -if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 2): - version_ge_1_3 = ["-DVERSION_GE_1_3"] -version_ge_1_5 = [] -if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 4): - version_ge_1_5 = ["-DVERSION_GE_1_5"] -version_dependent_macros = version_ge_1_1 + version_ge_1_3 + version_ge_1_5 - if not IS_ROCM_PYTORCH: _, bare_metal_version = get_cuda_bare_metal_version(CUDA_HOME) else: _, bare_metal_version, bare_metal_minor = get_rocm_bare_metal_version(ROCM_HOME) -if IS_ROCM_PYTORCH and (ROCM_MAJOR >= 6): - version_dependent_macros += ["-DHIPBLAS_V2"] - if "--cpp_ext" in sys.argv or "--cuda_ext" in sys.argv: if TORCH_MAJOR == 0: @@ -273,8 +254,17 @@ def command_exists(cmd): BUILD_OP_PLATFORM = 1 if sys.platform == "win32" else 0 BUILD_OP_DEFAULT = int(get_env_if_set('APEX_BUILD_OPS', BUILD_OP_PLATFORM)) print(f"APEX_BUILD_OPS={BUILD_OP_DEFAULT}") +BUILD_CPP_OP = int(get_env_if_set('APEX_CPP_OPS', BUILD_OP_PLATFORM)) +BUILD_CUDA_OP = int(get_env_if_set('APEX_CUDA_OPS', BUILD_OP_PLATFORM)) +BUILD_AITER_OP = int(get_env_if_set('APEX_AITER_OPS', BUILD_OP_PLATFORM)) +build_flags = { + "APEX_BUILD_OPS" : BUILD_OP_DEFAULT, + "APEX_CPP_OPS" : BUILD_CPP_OP, + "APEX_OPS" : BUILD_CUDA_OP, + "APEX_AITER_OPS" : BUILD_AITER_OP + } -ext_modules2 = [] +ext_modules = [] def is_env_set(key): """ @@ -292,9 +282,17 @@ def op_enabled(op_name): env_var = op_envvar(op_name) return int(get_env_if_set(env_var, BUILD_OP_DEFAULT)) +def is_op_included(op_name): + #check if operation has BUILD_FLAG defined + assert hasattr(ALL_OPS[op_name], 'INCLUDE_FLAG'), \ + f"{op_name} is missing INCLUDE_FLAG field" + include_flag = ALL_OPS[op_name].INCLUDE_FLAG + return get_env_if_set(include_flag, False) + install_ops = dict.fromkeys(ALL_OPS.keys(), False) for op_name, builder in ALL_OPS.items(): op_compatible = builder.is_compatible() + op_included = is_op_included(op_name) # If op is requested but not available, throw an error. if op_enabled(op_name) and not op_compatible: @@ -303,14 +301,21 @@ def op_enabled(op_name): builder.warning(f"Skip pre-compile of incompatible {op_name}; One can disable {op_name} with {env_var}=0") continue + #if the necessary build flags for the op is not provided, then skip building it + if not op_included: + builder.warning(f"Skip pre-compile of incompatible {op_name}; Build flags for {op_name}: {ALL_OPS[op_name].INCLUDE_FLAG} not set") + del install_ops[op_name] + continue + # If op is compatible but install is not enabled (JIT mode). if IS_ROCM_PYTORCH and op_compatible and not op_enabled(op_name): builder.hipify_extension() # If op install enabled, add builder to extensions. 
+ # Also check if corresponding flags are checked if op_enabled(op_name) and op_compatible: install_ops[op_name] = op_enabled(op_name) - ext_modules2.append(builder.builder()) + ext_modules.append(builder.builder()) print(f'Install Ops={install_ops}') @@ -391,6 +396,7 @@ def op_enabled(op_name): fd.write(f"git_hash='{git_hash}'\n") fd.write(f"git_branch='{git_branch}'\n") fd.write(f"installed_ops={install_ops}\n") + fd.write(f"build_flags={build_flags}\n") fd.write(f"accelerator_name='{accelerator_name}'\n") fd.write(f"torch_info={torch_info}\n") @@ -406,10 +412,9 @@ def op_enabled(op_name): exclude=("build", "include", "tests", "dist", "docs", "tests", "examples", "apex.egg-info", "op_builder", "accelerator") ), description="PyTorch Extensions written by NVIDIA", - ext_modules=ext_modules2, - cmdclass={'build_ext': BuildExtension} if ext_modules2 else {}, + ext_modules=ext_modules, + cmdclass={'build_ext': BuildExtension} if ext_modules else {}, extras_require=extras, install_requires=required, include_package_data=True ) - From 00d66d4f4400d23eb28a25d7cab5f8fa80788dd4 Mon Sep 17 00:00:00 2001 From: skishore Date: Tue, 22 Jul 2025 12:02:52 +0000 Subject: [PATCH 40/79] check for minimum torch version for nccl allocator, check if the module is compatible other removed from installed ops list --- op_builder/builder.py | 3 +++ op_builder/nccl_allocator.py | 11 ++++++----- setup.py | 8 +++++++- 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/op_builder/builder.py b/op_builder/builder.py index b35ab4bea..7262efa46 100644 --- a/op_builder/builder.py +++ b/op_builder/builder.py @@ -868,6 +868,9 @@ def nccl_version(self): ) return _nccl_version_getter.get_nccl_version() + def torch_version(self): + return (TORCH_MAJOR, TORCH_MINOR) + class TorchCPUOpBuilder(CUDAOpBuilder): def get_cuda_lib64_path(self): diff --git a/op_builder/nccl_allocator.py b/op_builder/nccl_allocator.py index e687d3f82..a1d35b90f 100644 --- a/op_builder/nccl_allocator.py +++ b/op_builder/nccl_allocator.py @@ -28,8 +28,9 @@ def nvcc_args(self): return self.nccl_args() def is_compatible(self): - available_nccl_version = self.nccl_version() - if available_nccl_version >= (2, 19): - return True - else: - return False \ No newline at end of file + torch_version = self.torch_version() + if torch_version >= (2, 6): + available_nccl_version = self.nccl_version() + if available_nccl_version >= (2, 19): + return True + return False \ No newline at end of file diff --git a/setup.py b/setup.py index caa970f25..b91f02923 100644 --- a/setup.py +++ b/setup.py @@ -303,7 +303,13 @@ def is_op_included(op_name): #if the necessary build flags for the op is not provided, then skip building it if not op_included: - builder.warning(f"Skip pre-compile of incompatible {op_name}; Build flags for {op_name}: {ALL_OPS[op_name].INCLUDE_FLAG} not set") + builder.warning(f"Skipping unsupported {op_name}; Build flags for {op_name}: {ALL_OPS[op_name].INCLUDE_FLAG} not set") + del install_ops[op_name] + continue + + #check if the conditions for building the module are satisfied + if not op_compatible: + builder.warning(f"Skipping unsupported {op_name}; The conditions for building this module are not satisfied.") del install_ops[op_name] continue From 705f6757dee9f767d7b6acd58aea074fa7accb87 Mon Sep 17 00:00:00 2001 From: skishore Date: Thu, 24 Jul 2025 10:18:22 +0000 Subject: [PATCH 41/79] add build as a dependency to support wheel building --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt 
b/requirements.txt index 53ae8d256..478362844 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,4 +6,5 @@ pytest>=3.5.1 packaging>=14.0 matplotlib>=3.8 pandas>=2.2.2 -py-cpuinfo \ No newline at end of file +py-cpuinfo +build \ No newline at end of file From 783fbdea8898f372fbe084d2f80130b99637bfa7 Mon Sep 17 00:00:00 2001 From: skishore Date: Thu, 24 Jul 2025 10:57:00 +0000 Subject: [PATCH 42/79] Replace is_compatible to check for installation conditions with is_supported, because there is an issue with loading nccl allocator --- op_builder/builder.py | 9 +++++++++ op_builder/nccl_allocator.py | 2 +- setup.py | 12 ++---------- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/op_builder/builder.py b/op_builder/builder.py index 7262efa46..8611e252c 100644 --- a/op_builder/builder.py +++ b/op_builder/builder.py @@ -310,6 +310,12 @@ def is_compatible(self, verbose=False): ''' return True + def is_supported(self): + ''' + Check if all conditions are satisfied to build this op + ''' + return True + def extra_ldflags(self): return [] @@ -871,6 +877,9 @@ def nccl_version(self): def torch_version(self): return (TORCH_MAJOR, TORCH_MINOR) + def is_supported(self): + return super().is_supported() + class TorchCPUOpBuilder(CUDAOpBuilder): def get_cuda_lib64_path(self): diff --git a/op_builder/nccl_allocator.py b/op_builder/nccl_allocator.py index a1d35b90f..0cbf27634 100644 --- a/op_builder/nccl_allocator.py +++ b/op_builder/nccl_allocator.py @@ -27,7 +27,7 @@ def cxx_args(self): def nvcc_args(self): return self.nccl_args() - def is_compatible(self): + def is_supported(self): torch_version = self.torch_version() if torch_version >= (2, 6): available_nccl_version = self.nccl_version() diff --git a/setup.py b/setup.py index b91f02923..40288c128 100644 --- a/setup.py +++ b/setup.py @@ -58,15 +58,6 @@ if os.path.exists(os.path.join(torch_dir, "include", "ATen", "Atomic.cuh")): found_aten_atomic_header = True -def raise_if_cuda_home_none(global_option: str) -> None: - if CUDA_HOME is not None or ROCM_HOME is not None: - return - raise RuntimeError( - f"{global_option} was requested, but nvcc was not found. Are you sure your environment has nvcc available? " - "If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, " - "only images whose names contain 'devel' will provide nvcc." - ) - def get_cuda_bare_metal_version(cuda_dir): raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True) output = raw_output.split() @@ -293,6 +284,7 @@ def is_op_included(op_name): for op_name, builder in ALL_OPS.items(): op_compatible = builder.is_compatible() op_included = is_op_included(op_name) + op_supported = builder.is_supported() # If op is requested but not available, throw an error. if op_enabled(op_name) and not op_compatible: @@ -308,7 +300,7 @@ def is_op_included(op_name): continue #check if the conditions for building the module are satisfied - if not op_compatible: + if not op_supported: builder.warning(f"Skipping unsupported {op_name}; The conditions for building this module are not satisfied.") del install_ops[op_name] continue From c36641761a66216ed510a9722ec85a998d3feff8 Mon Sep 17 00:00:00 2001 From: Sriram Date: Thu, 24 Jul 2025 06:57:19 -0500 Subject: [PATCH 43/79] Similar to pytorch we create a make command to install aiter, that the user can use. 
There will be no building aiter in the setup.py --- Makefile | 11 +++++++++++ setup.py | 2 -- 2 files changed, 11 insertions(+), 2 deletions(-) create mode 100644 Makefile diff --git a/Makefile b/Makefile new file mode 100644 index 000000000..9caf57155 --- /dev/null +++ b/Makefile @@ -0,0 +1,11 @@ +PYTHON = python3 +PIP = $(PYTHON) -m pip + +clean: # This will remove ALL build folders. + @rm -r build/ + @rm -r dist/ + @rm -r *.egg-info +aiter: + $(PIP) uninstall -y aiter + cd third_party/aiter && $(PIP) install . --no-build-isolation --no-deps + diff --git a/setup.py b/setup.py index 40288c128..e3167b31f 100644 --- a/setup.py +++ b/setup.py @@ -247,12 +247,10 @@ def command_exists(cmd): print(f"APEX_BUILD_OPS={BUILD_OP_DEFAULT}") BUILD_CPP_OP = int(get_env_if_set('APEX_CPP_OPS', BUILD_OP_PLATFORM)) BUILD_CUDA_OP = int(get_env_if_set('APEX_CUDA_OPS', BUILD_OP_PLATFORM)) -BUILD_AITER_OP = int(get_env_if_set('APEX_AITER_OPS', BUILD_OP_PLATFORM)) build_flags = { "APEX_BUILD_OPS" : BUILD_OP_DEFAULT, "APEX_CPP_OPS" : BUILD_CPP_OP, "APEX_OPS" : BUILD_CUDA_OP, - "APEX_AITER_OPS" : BUILD_AITER_OP } ext_modules = [] From 43632d7d0058ff4fe391a1fbc9a4e3bcc5f4037c Mon Sep 17 00:00:00 2001 From: skishore Date: Thu, 24 Jul 2025 15:17:44 +0000 Subject: [PATCH 44/79] update extension import test so that it considers jit compile extensions --- tests/test_extension_import.py | 89 ++++++++++++++++++++++++++++++---- 1 file changed, 80 insertions(+), 9 deletions(-) diff --git a/tests/test_extension_import.py b/tests/test_extension_import.py index 153254ddd..72d88688e 100644 --- a/tests/test_extension_import.py +++ b/tests/test_extension_import.py @@ -2,15 +2,17 @@ import os import subprocess import sys - +import site +import ast +from apex.op_builder.all_ops import ALL_OPS class TestExtensionImport(unittest.TestCase): - def get_extensions_list(self): - """ - This method reads setup.py and gets the list of extensions from the setup.py file - """ + def __init__(self, *args, **kwargs): + super(TestExtensionImport, self).__init__(*args, **kwargs) + + self.jit_info_file = "apex/git_version_info_installed.py" #find the absolute path of this file current_file_path = os.path.abspath(__file__) @@ -21,9 +23,24 @@ def get_extensions_list(self): #apex folder parent_folder_path = os.path.dirname(parent_folder_path) self.parent_folder_path = parent_folder_path + + def is_jit_modules_mode(self): + """ + This method checks if the file git_version_info_installed.py exists + """ + jit_file_path = os.path.join(site.getsitepackages()[0], self.jit_info_file) + #print ("jit_file_path", jit_file_path) + mode = os.path.exists(jit_file_path) + print ("jit_mode", mode) + return mode + + def get_extensions_list_from_setup(self): + """ + This method reads setup.py and gets the list of extensions from the setup.py file + """ #get setup.py file contents - setup_path = os.path.join(parent_folder_path, "setup.py") + setup_path = os.path.join(self.parent_folder_path, "setup.py") #read setup_path contents with open(setup_path, 'r') as f: @@ -62,6 +79,21 @@ def get_extensions_list(self): return extensions + def get_jit_modules(self): + """ + This method reads the jit file and extracts installed_ops dictionary + """ + jit_info_path = os.path.join(site.getsitepackages()[0], self.jit_info_file) + with open(jit_info_path, 'r') as f: + lines = f.readlines() + for line in lines: + if "installed_ops" in line: + ops_list = line[len("installed_ops") + 1 : ] + ops_list = ast.literal_eval(ops_list) + #print ("op_list", ops_list) + return 
list(ops_list.keys()) + return {} + def get_environment(self): """ This method retrieves the environment for testing import @@ -122,10 +154,46 @@ def check_extension_import(self, extension_name, env): print(f"Error testing import for {extension_name}: {e}") return False, str(e) + def check_jit_extension_import(self, extension_name, env): + all_ops = dict.fromkeys(ALL_OPS.keys(), False) + #get the builder for that extension + builder = ALL_OPS[extension_name] + builder_name = type(builder).__name__ + #print ("----builder_name-----", builder_name) + + #increase timeout + timeout = 60 * 60 + try: + # Run Python subprocess to test the import + result = subprocess.run([ + sys.executable, '-c', + 'from apex.op_builder import ' + builder_name + + '\n' + builder_name + "().load()" + ], capture_output=True, text=True, timeout=timeout, env=env) + print ("result.stdout", result.stdout, result.stderr) + # Check if subprocess completed successfully + if result.returncode != 0 and "Error" in result.stderr: + return False, result.stderr + else: + return True, "" + + except subprocess.TimeoutExpired: + print(f"Import test timed out for {extension_name}") + return False, "Timeout" + except Exception as e: + print(f"Error testing import for {extension_name}: {e}") + return False, str(e) + def test_extensions_import(self): - #get the list of extensions - extensions = self.get_extensions_list() + #check the extensions mode + jit_mode = self.is_jit_modules_mode() + + if not jit_mode: + #get the list of extensions from setup.py + extensions = self.get_extensions_list_from_setup() + else: + extensions = self.get_jit_modules() #get environment env = self.get_environment() @@ -135,7 +203,10 @@ def test_extensions_import(self): for extension in extensions: print ("checking extension", extension) with self.subTest(extension=extension): - success, error_message = self.check_extension_import(extension, env) + if not jit_mode: + success, error_message = self.check_extension_import(extension, env) + else: + success, error_message = self.check_jit_extension_import(extension, env) #self.assertTrue(success, f"Failed to import extension: {extension}") results.append((extension, success, error_message)) From 8118f21822511e72557e063374c40b0684710983 Mon Sep 17 00:00:00 2001 From: skishore Date: Sat, 26 Jul 2025 08:41:52 +0000 Subject: [PATCH 45/79] clean up MultiTensorApply usages so that amp_C is not build in jit compile mode --- apex/amp/_process_optimizer.py | 4 +++- apex/amp/scaler.py | 5 ++++- apex/contrib/clip_grad/clip_grad.py | 3 ++- apex/contrib/optimizers/distributed_fused_adam.py | 7 ++++++- apex/contrib/optimizers/distributed_fused_lamb.py | 10 ++++++++-- apex/contrib/optimizers/fp16_optimizer.py | 4 +++- apex/contrib/optimizers/fused_adam.py | 4 +++- apex/contrib/optimizers/fused_lamb.py | 4 +++- apex/contrib/optimizers/fused_sgd.py | 6 ++++-- apex/fp16_utils/fp16_optimizer.py | 4 +++- apex/multi_tensor_apply/__init__.py | 6 +----- apex/optimizers/fused_adagrad.py | 4 +++- apex/optimizers/fused_adam.py | 4 +++- apex/optimizers/fused_lamb.py | 6 +++++- apex/optimizers/fused_lars.py | 4 +++- apex/optimizers/fused_mixed_precision_lamb.py | 8 ++++++-- apex/optimizers/fused_novograd.py | 4 +++- apex/optimizers/fused_sgd.py | 4 +++- apex/parallel/distributed.py | 4 +++- apex/transformer/pipeline_parallel/utils.py | 9 +++++---- examples/imagenet/main_amp.py | 1 - tests/L0/run_optimizers/test_lamb.py | 4 +++- tests/L1/common/main_amp.py | 3 ++- 23 files changed, 79 insertions(+), 33 deletions(-) diff --git 
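The hunks below all apply the same mechanical change: instead of importing a module-level multi_tensor_applier singleton, each call site constructs MultiTensorApply(256*32) itself and loads amp_C through its op builder only when the applier reports it is available, so amp_C is not built eagerly in JIT-compile mode. A minimal sketch of the resulting pattern, assuming a CUDA build where amp_C can be JIT-loaded (not taken verbatim from any single file):

    import torch
    from apex.multi_tensor_apply import MultiTensorApply

    multi_tensor_applier = MultiTensorApply(256 * 32)    # constructed at the call site
    if multi_tensor_applier.available:
        from apex.op_builder import AmpCBuilder
        amp_C = AmpCBuilder().load()                     # JIT-compiles/loads amp_C on demand
        grads = [torch.randn(1024, device="cuda")]       # example tensors
        overflow_buf = torch.zeros(1, dtype=torch.int32, device="cuda")
        norm, _ = multi_tensor_applier(
            amp_C.multi_tensor_l2norm, overflow_buf, [grads], False)   # no per-tensor norms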
a/apex/amp/_process_optimizer.py b/apex/amp/_process_optimizer.py index df621b87c..5f7ef4c9d 100644 --- a/apex/amp/_process_optimizer.py +++ b/apex/amp/_process_optimizer.py @@ -1,6 +1,6 @@ import types from ..fp16_utils import master_params_to_model_params -from ..multi_tensor_apply import multi_tensor_applier +from ..multi_tensor_apply import MultiTensorApply from ._amp_state import maybe_print, _amp_state import torch from ..optimizers import FusedSGD @@ -13,6 +13,7 @@ def __init__(self): def _master_params_to_model_params(self): stash = self._amp_stash + multi_tensor_applier = MultiTensorApply(256*32) if multi_tensor_applier.available: if len(stash.all_fp16_params) > 0: multi_tensor_applier( @@ -319,6 +320,7 @@ def _amp_lazy_init(self): def _process_optimizer(optimizer, properties): + multi_tensor_applier = MultiTensorApply(256*32) if hasattr(optimizer, "_amp_stash"): raise RuntimeError("A given optimizer should only be passed through amp.initialize once.") else: diff --git a/apex/amp/scaler.py b/apex/amp/scaler.py index 75e0afbb4..33e431e71 100644 --- a/apex/amp/scaler.py +++ b/apex/amp/scaler.py @@ -1,5 +1,5 @@ import torch -from ..multi_tensor_apply import multi_tensor_applier +from apex.multi_tensor_apply import MultiTensorApply from ._amp_state import _amp_state, master_params, maybe_print from itertools import product @@ -63,6 +63,7 @@ def __init__(self, self._unskipped = 0 self._has_overflow = False self._overflow_buf = torch.tensor([0], dtype=torch.int, device='cuda') + multi_tensor_applier = MultiTensorApply(256*32) if multi_tensor_applier.available: from apex.op_builder import AmpCBuilder amp_C = AmpCBuilder().load() @@ -102,6 +103,7 @@ def unscale_python(self, model_grads, master_grads, scale): # unused_scale keeps some of the old API alive for hopefully a short time. 
def unscale(self, model_grads, master_grads, unused_scale, models_are_masters=False, scale_override=None): + multi_tensor_applier = MultiTensorApply(256*32) if self._has_overflow: return @@ -164,6 +166,7 @@ def unscale_with_stashed(self, stashed_master_grads, master_grads, scale_override=None): + multi_tensor_applier = MultiTensorApply(256*32) if self._has_overflow: return diff --git a/apex/contrib/clip_grad/clip_grad.py b/apex/contrib/clip_grad/clip_grad.py index 666df350e..931786206 100644 --- a/apex/contrib/clip_grad/clip_grad.py +++ b/apex/contrib/clip_grad/clip_grad.py @@ -6,7 +6,7 @@ try: from apex.op_builder import AmpCBuilder amp_C = AmpCBuilder().load() - from apex.multi_tensor_apply import multi_tensor_applier + from apex.multi_tensor_apply import MultiTensorApply _kernel_import_succeeded = True except ImportError: _kernel_import_succeeded = False @@ -76,6 +76,7 @@ def clip_grad_norm_( # Compute gradient L2 norms norms = [] dummy_overflow_buf = torch.zeros([1], dtype=torch.int32, device=device) + multi_tensor_applier = MultiTensorApply(256*32) if grads_fp32: norms.append( multi_tensor_applier( diff --git a/apex/contrib/optimizers/distributed_fused_adam.py b/apex/contrib/optimizers/distributed_fused_adam.py index 7ff87f791..18a5779eb 100644 --- a/apex/contrib/optimizers/distributed_fused_adam.py +++ b/apex/contrib/optimizers/distributed_fused_adam.py @@ -27,7 +27,7 @@ except ImportError: nccl_allocator = None -from apex.multi_tensor_apply import multi_tensor_applier +from apex.multi_tensor_apply import MultiTensorApply from apex.op_builder import AmpCBuilder, DistributedAdamBuilder, FusedAdamBuilder amp_C = AmpCBuilder().load() distributed_adam_cuda = DistributedAdamBuilder().load() @@ -211,6 +211,7 @@ def _multi_tensor_copy( use_fused_kernel = use_fused_kernel and is_cuda and is_contiguous # Copy buffers + multi_tensor_applier = MultiTensorApply(256*32) if use_fused_kernel and _FOUND_DEPRECATED_FUSED_ADAM: if dummy_overflow_buf is None: dummy_overflow_buf = torch.zeros([1], dtype=torch.int32, device="cuda") @@ -2265,6 +2266,7 @@ def _local_grad_norm( ) # Compute norm of each group of grads + multi_tensor_applier = MultiTensorApply(256*32) grad_norm_sq = None for grad_group in grad_groups.values(): grad_group_norm_sq = ( @@ -2663,6 +2665,7 @@ def _local_step(self, bucket_ids: List[int]) -> None: # Apply optimizer step to each param group adam_func = distributed_adam_cuda.multi_tensor_fused_adam_capturable \ if self.capturable else distributed_adam_cuda.multi_tensor_fused_adam + multi_tensor_applier = MultiTensorApply(256*32) for (group_id, _, _, _), group_buffers in buffers.items(): group = self.param_groups[group_id] beta1, beta2 = group["betas"] @@ -2750,6 +2753,7 @@ def _local_step_with_param_remainders( ) # Apply optimizer step to each param group + multi_tensor_applier = MultiTensorApply(256*32) for (group_id, _), group_buffers in buffers.items(): group = self.param_groups[group_id] beta1, beta2 = group["betas"] @@ -2833,6 +2837,7 @@ def _local_step_with_scaled_states( buf.mul_(scale) # Apply optimizer step to each param group + multi_tensor_applier = MultiTensorApply(256*32) for group_id, buffers in group_buffers.items(): group = self.param_groups[group_id] beta1, beta2 = group["betas"] diff --git a/apex/contrib/optimizers/distributed_fused_lamb.py b/apex/contrib/optimizers/distributed_fused_lamb.py index c1a9ee12d..4df7266c1 100644 --- a/apex/contrib/optimizers/distributed_fused_lamb.py +++ b/apex/contrib/optimizers/distributed_fused_lamb.py @@ -5,7 +5,7 @@ import importlib 
from apex.op_builder import AmpCBuilder, DistributedLambBuilder, FusedAdamBuilder amp_C = AmpCBuilder().load() -from apex.multi_tensor_apply import multi_tensor_applier +from apex.multi_tensor_apply import MultiTensorApply import torch.distributed.distributed_c10d as c10d @@ -735,7 +735,8 @@ def _pipeline_block_reductions(self, block_id): else: self._reduce_scatter_and_all_reduce(block_id) - # Compute L2 grad norm + # Compute L2 grad nor + multi_tensor_applier = MultiTensorApply(256*32) if block_id == 0: with torch.cuda.stream(self._l2_grad_norm_st): for block_id in range(self._num_blocks): @@ -790,6 +791,7 @@ def _pipeline_block_reductions(self, block_id): self._reductions_works[block_id][chunk_id].wait() def __compute_contrib_param_norm(self): + multi_tensor_applier = MultiTensorApply(256*32) if self._contrib_model_param_for_norm_fp16 is not None and self._contrib_model_param_for_norm_fp32 is not None: gnorm_fp16 = multi_tensor_applier(self.multi_tensor_l2norm, self._overflow_buf, [self._contrib_model_param_for_norm_fp16], True)[1] gnorm_fp32 = multi_tensor_applier(self.multi_tensor_l2norm, self._overflow_buf, [self._contrib_model_param_for_norm_fp32], True)[1] @@ -803,6 +805,7 @@ def __compute_contrib_param_norm(self): return gnorm def __compute_contrib_update_norm(self): + multi_tensor_applier = MultiTensorApply(256*32) l2_norm = torch.zeros(size=[self._model_params_num], dtype=torch.float32, device='cuda') local_contrib_l2_norm = multi_tensor_applier(self.multi_tensor_l2norm, self._overflow_buf, [self._contrib_update_frag_for_norm], True)[1] ** 2 l2_norm.scatter_(dim=0, index=self._offsets, src=local_contrib_l2_norm) @@ -811,6 +814,7 @@ def __compute_contrib_update_norm(self): return l2_norm def _pipeline_step(self): + multi_tensor_applier = MultiTensorApply(256*32) global_scale = self.global_scale # if clip before ar, set max_grad_norm to 0 max_grad_norm = self.defaults['max_grad_norm'] * self._clip_after_ar @@ -900,6 +904,7 @@ def _pipeline_step(self): ) def _flatten_grad_mt(self, scale): + multi_tensor_applier = MultiTensorApply(256*32) if len(self._grads_fp16) > 0: self._overflow_buf.zero_() if not self._fused_norm: @@ -986,6 +991,7 @@ def complete_reductions(self): self._grads_generated = [False]*len(self._grads_info) def step(self, closure=None, grad_scaler=None): + multi_tensor_applier = MultiTensorApply(256*32) loss = None if closure is not None: loss = closure() diff --git a/apex/contrib/optimizers/fp16_optimizer.py b/apex/contrib/optimizers/fp16_optimizer.py index 2171e181b..b6338c8c5 100755 --- a/apex/contrib/optimizers/fp16_optimizer.py +++ b/apex/contrib/optimizers/fp16_optimizer.py @@ -1,5 +1,5 @@ import torch -from apex.multi_tensor_apply import multi_tensor_applier +from apex.multi_tensor_apply import MultiTensorApply class FP16_Optimizer(object): """ @@ -53,6 +53,7 @@ def __init__(self, self.fp32_groups.append(fp32_group) param_group['params'] = fp32_group + multi_tensor_applier = MultiTensorApply(256*32) if multi_tensor_applier.available: from apex.op_builder import AmpCBuilder amp_C = AmpCBuilder().load() @@ -108,6 +109,7 @@ def step(self, closure=None): # nan check self.overflow_buf.zero_() + multi_tensor_applier = MultiTensorApply(256*32) for fp16_grad in fp16_grads: if len(fp16_grad) > 0: norm, norm_per_tensor = multi_tensor_applier(self.multi_tensor_l2norm, diff --git a/apex/contrib/optimizers/fused_adam.py b/apex/contrib/optimizers/fused_adam.py index 5c2f73f23..379399ee2 100644 --- a/apex/contrib/optimizers/fused_adam.py +++ 
b/apex/contrib/optimizers/fused_adam.py @@ -1,7 +1,7 @@ import types import torch import importlib -from apex.multi_tensor_apply import multi_tensor_applier +from apex.multi_tensor_apply import MultiTensorApply class FusedAdam(torch.optim.Optimizer): @@ -46,6 +46,7 @@ def __init__(self, params, self._use_multi_tensor = False if use_mt: + multi_tensor_applier = MultiTensorApply(256*32) if not multi_tensor_applier.available: print("Warning: multi_tensor_applier is unavailable") else: @@ -189,6 +190,7 @@ def step(self, closure=None, grads=None, output_params=None, scale=1., grad_norm group['weight_decay']) if self._use_multi_tensor: + multi_tensor_applier = MultiTensorApply(256*32) with torch.cuda.device(tensordevice): multi_tensor_applier( fused_adam_cuda.adam_mt, diff --git a/apex/contrib/optimizers/fused_lamb.py b/apex/contrib/optimizers/fused_lamb.py index 3b3e66d7d..4ca301682 100644 --- a/apex/contrib/optimizers/fused_lamb.py +++ b/apex/contrib/optimizers/fused_lamb.py @@ -1,7 +1,7 @@ import torch import importlib import math -from apex.multi_tensor_apply import multi_tensor_applier +from apex.multi_tensor_apply import MultiTensorApply class FusedLAMB(torch.optim.Optimizer): @@ -72,6 +72,7 @@ def __init__(self, params, lr=1e-3, bias_correction=True, grad_averaging=grad_averaging, max_grad_norm=max_grad_norm) super(FusedLAMB, self).__init__(params, defaults) + multi_tensor_applier = MultiTensorApply(256*32) if multi_tensor_applier.available: from apex.op_builder import AmpCBuilder amp_C = AmpCBuilder().load() @@ -120,6 +121,7 @@ def step(self, closure=None): g_norm_32, g_norm_16 = 0.0, 0.0 # compute grad norm for two lists + multi_tensor_applier = MultiTensorApply(256*32) if len(g_all_32) > 0: g_norm_32 = multi_tensor_applier(self.multi_tensor_l2norm, self._dummy_overflow_buf, diff --git a/apex/contrib/optimizers/fused_sgd.py b/apex/contrib/optimizers/fused_sgd.py index 333e50288..de93e313e 100644 --- a/apex/contrib/optimizers/fused_sgd.py +++ b/apex/contrib/optimizers/fused_sgd.py @@ -2,7 +2,7 @@ import torch from torch.optim.optimizer import Optimizer, required -from apex.multi_tensor_apply import multi_tensor_applier +from apex.multi_tensor_apply import MultiTensorApply class FusedSGD(Optimizer): r"""Implements stochastic gradient descent (optionally with momentum). 
@@ -82,6 +82,7 @@ def __init__(self, params, lr=required, momentum=0, dampening=0, self.wd_after_momentum = wd_after_momentum + multi_tensor_applier = MultiTensorApply(256*32) if multi_tensor_applier.available: from apex.op_builder import AmpCBuilder amp_C = AmpCBuilder().load() @@ -191,11 +192,12 @@ def step(self, closure=None, grads=None, output_params=None, scale=1., grad_norm fp16_set = [fp16_grads, fp32_from_fp16_params, fp32_from_fp16_momentums, fp16_params] launch_sets = [fp16_set, fp32_set] + multi_tensor_applier = MultiTensorApply(256*32) for launch_set, first_run in zip(launch_sets, first_runs): assert len(launch_set[0]) == len(launch_set[1]) assert len(launch_set[0]) == len(launch_set[2]) - if len(launch_set[0]) > 0: + if len(launch_set[0]) > 0: multi_tensor_applier( self.multi_tensor_sgd, self._dummy_overflow_buf, diff --git a/apex/fp16_utils/fp16_optimizer.py b/apex/fp16_utils/fp16_optimizer.py index feb3e3ed6..e9647e442 100755 --- a/apex/fp16_utils/fp16_optimizer.py +++ b/apex/fp16_utils/fp16_optimizer.py @@ -6,7 +6,7 @@ from ..amp._amp_state import _amp_state, maybe_print from ..amp.scaler import LossScaler -from ..multi_tensor_apply import multi_tensor_applier +from apex.multi_tensor_apply import MultiTensorApply from .fp16util import model_grads_to_master_grads, master_params_to_model_params, clip_grad_norm # TODO: Update overflow check + downscale to use Carl's fused kernel. @@ -100,6 +100,7 @@ def __init__(self, self.clip_grad_norm = clip_grad_norm # TODO: Centralize exposure and import error checking for the C backend. + multi_tensor_applier = MultiTensorApply(256*32) if multi_tensor_applier.available: from apex.op_builder import AmpCBuilder amp_C = AmpCBuilder().load() @@ -159,6 +160,7 @@ def zero_grad(self, set_grads_to_None=False): # self.loss_scaler.update_scale(has_overflow) def _master_params_to_model_params(self): + multi_tensor_applier = MultiTensorApply(256*32) if multi_tensor_applier.available: if len(self.all_fp16_params) > 0: multi_tensor_applier( diff --git a/apex/multi_tensor_apply/__init__.py b/apex/multi_tensor_apply/__init__.py index 31e2a53de..88de4cdfe 100644 --- a/apex/multi_tensor_apply/__init__.py +++ b/apex/multi_tensor_apply/__init__.py @@ -1,5 +1 @@ -from .multi_tensor_apply import MultiTensorApply - -multi_tensor_applier = MultiTensorApply(256*32) -multi_tensor_applier_l2norm = MultiTensorApply(2048*32) - +from .multi_tensor_apply import MultiTensorApply \ No newline at end of file diff --git a/apex/optimizers/fused_adagrad.py b/apex/optimizers/fused_adagrad.py index 843841113..ed3a93690 100644 --- a/apex/optimizers/fused_adagrad.py +++ b/apex/optimizers/fused_adagrad.py @@ -1,5 +1,5 @@ import torch -from apex.multi_tensor_apply import multi_tensor_applier +from apex.multi_tensor_apply import MultiTensorApply class FusedAdagrad(torch.optim.Optimizer): @@ -48,6 +48,7 @@ def __init__(self, params, lr=1e-2, eps=1e-10, self.adagrad_w_mode = 1 if adagrad_w_mode else 0 self.set_grad_none = set_grad_none + multi_tensor_applier = MultiTensorApply(256*32) if multi_tensor_applier.available: from apex.op_builder import AmpCBuilder amp_C = AmpCBuilder().load() @@ -103,6 +104,7 @@ def step(self, closure=None): else: raise RuntimeError('FusedAdagrad only support fp16, bfloat16 and fp32.') + multi_tensor_applier = MultiTensorApply(256*32) if(len(g_16) > 0): multi_tensor_applier(self.multi_tensor_adagrad, self._dummy_overflow_buf, diff --git a/apex/optimizers/fused_adam.py b/apex/optimizers/fused_adam.py index 636ba9de6..294269747 100644 --- 
a/apex/optimizers/fused_adam.py +++ b/apex/optimizers/fused_adam.py @@ -1,5 +1,5 @@ import torch -from apex.multi_tensor_apply import multi_tensor_applier +from apex.multi_tensor_apply import MultiTensorApply class FusedAdam(torch.optim.Optimizer): @@ -106,6 +106,7 @@ def __init__(self, params, lr=1e-3, bias_correction=True, self._step_supports_amp_scaling = True + multi_tensor_applier = MultiTensorApply(256*32) if multi_tensor_applier.available: from apex.op_builder import AmpCBuilder amp_C = AmpCBuilder().load() @@ -137,6 +138,7 @@ def step(self, closure=None, grads=None, output_params=None, scale=None, grad_no if any(p is not None for p in [grads, output_params, scale, grad_norms]): raise RuntimeError('FusedAdam has been updated. Simply initialize it identically to torch.optim.Adam, and call step() with no arguments.') loss = None + multi_tensor_applier = MultiTensorApply(256*32) if closure is not None: loss = closure() diff --git a/apex/optimizers/fused_lamb.py b/apex/optimizers/fused_lamb.py index d28b0fc7a..37b5642ec 100644 --- a/apex/optimizers/fused_lamb.py +++ b/apex/optimizers/fused_lamb.py @@ -1,5 +1,5 @@ import torch -from apex.multi_tensor_apply import multi_tensor_applier, multi_tensor_applier_l2norm +from apex.multi_tensor_apply import MultiTensorApply class FusedLAMB(torch.optim.Optimizer): @@ -72,6 +72,8 @@ def __init__(self, params, lr=1e-3, bias_correction=True, grad_averaging=grad_averaging, max_grad_norm=max_grad_norm) super(FusedLAMB, self).__init__(params, defaults) + multi_tensor_applier = MultiTensorApply(256*32) + multi_tensor_applier_l2norm = MultiTensorApply(2048*32) if multi_tensor_applier.available and multi_tensor_applier_l2norm.available: from apex.op_builder import AmpCBuilder amp_C = AmpCBuilder().load() @@ -121,6 +123,8 @@ def step(self, closure=None): device = self.param_groups[0]["params"][0].device g_norm_32, g_norm_16 = torch.zeros(1, device=device), torch.zeros(1, device=device) # compute grad norm for two lists + multi_tensor_applier = MultiTensorApply(256*32) + multi_tensor_applier_l2norm = MultiTensorApply(2048*32) if len(g_all_32) > 0: g_norm_32 = multi_tensor_applier_l2norm(self.multi_tensor_l2norm, self._dummy_overflow_buf, diff --git a/apex/optimizers/fused_lars.py b/apex/optimizers/fused_lars.py index 744d558ff..de46f9203 100644 --- a/apex/optimizers/fused_lars.py +++ b/apex/optimizers/fused_lars.py @@ -2,7 +2,7 @@ from torch.optim.optimizer import Optimizer, required from torch import nn from torch.nn.parameter import Parameter -from apex.multi_tensor_apply import multi_tensor_applier +from apex.multi_tensor_apply import MultiTensorApply class FusedLARS(Optimizer): def __init__(self, params, lr=required, momentum=0, dampening=0, @@ -31,6 +31,7 @@ def __init__(self, params, lr=required, momentum=0, dampening=0, self.trust_coefficient = trust_coefficient self.eps = eps + multi_tensor_applier = MultiTensorApply(256*32) if multi_tensor_applier.available: from apex.op_builder import AmpCBuilder amp_C = AmpCBuilder().load() @@ -83,6 +84,7 @@ def step(self, closure=None): closure (callable, optional): A closure that reevaluates the model and returns the loss. 
""" + multi_tensor_applier = MultiTensorApply(256*32) loss = None if closure is not None: loss = closure() diff --git a/apex/optimizers/fused_mixed_precision_lamb.py b/apex/optimizers/fused_mixed_precision_lamb.py index 524100cb8..a7642b1d9 100644 --- a/apex/optimizers/fused_mixed_precision_lamb.py +++ b/apex/optimizers/fused_mixed_precision_lamb.py @@ -3,7 +3,7 @@ from itertools import chain from collections import defaultdict, abc as container_abcs -from apex.multi_tensor_apply import multi_tensor_applier, multi_tensor_applier_l2norm +from apex.multi_tensor_apply import MultiTensorApply class FusedMixedPrecisionLamb(torch.optim.Optimizer): @@ -31,7 +31,9 @@ def __init__(self, params, lr=1e-3, step=0, bias_correction=True, for idx,group in enumerate(self.param_groups): for item in tensor_state: self.param_groups[idx][item] = group[item].to(device=device) - + + multi_tensor_applier = MultiTensorApply(256*32) + multi_tensor_applier_l2norm = MultiTensorApply(2048*32) if multi_tensor_applier.available and multi_tensor_applier_l2norm.available: from apex.op_builder import AmpCBuilder amp_C = AmpCBuilder().load() @@ -181,6 +183,8 @@ def step(self, closure=None, grad_scaler=None): # grad_norm is of scaled gradients. # So, multiply `max_grad_norm` by scale. max_grad_norm = self.defaults['max_grad_norm'] * scale + multi_tensor_applier = MultiTensorApply(256*32) + multi_tensor_applier_l2norm = MultiTensorApply(2048*32) grad_norm = multi_tensor_applier_l2norm( self.multi_tensor_l2norm, self._dummy_overflow_buf, diff --git a/apex/optimizers/fused_novograd.py b/apex/optimizers/fused_novograd.py index 2f74f627a..d0ff7b24c 100644 --- a/apex/optimizers/fused_novograd.py +++ b/apex/optimizers/fused_novograd.py @@ -1,5 +1,5 @@ import torch -from apex.multi_tensor_apply import multi_tensor_applier +from apex.multi_tensor_apply import MultiTensorApply class FusedNovoGrad(torch.optim.Optimizer): @@ -76,6 +76,7 @@ def __init__(self, params, lr=1e-3, bias_correction=True, grad_averaging=grad_averaging, norm_type=norm_type, init_zero=init_zero) super(FusedNovoGrad, self).__init__(params, defaults) + multi_tensor_applier = MultiTensorApply(256*32) if multi_tensor_applier.available: from apex.op_builder import AmpCBuilder amp_C = AmpCBuilder().load() @@ -113,6 +114,7 @@ def step(self, closure=None): closure (callable, optional): A closure that reevaluates the model and returns the loss. """ + multi_tensor_applier = MultiTensorApply(256*32) loss = None if closure is not None: loss = closure() diff --git a/apex/optimizers/fused_sgd.py b/apex/optimizers/fused_sgd.py index 7c6481bb7..d8eb409e6 100644 --- a/apex/optimizers/fused_sgd.py +++ b/apex/optimizers/fused_sgd.py @@ -1,7 +1,7 @@ import torch from torch.optim.optimizer import Optimizer, required -from apex.multi_tensor_apply import multi_tensor_applier +from apex.multi_tensor_apply import MultiTensorApply class FusedSGD(Optimizer): r"""Implements stochastic gradient descent (optionally with momentum). @@ -97,6 +97,7 @@ def __init__(self, params, lr=required, momentum=0, dampening=0, self.scale_set_by_backward = False self.set_grad_none = set_grad_none + multi_tensor_applier = MultiTensorApply(256*32) if multi_tensor_applier.available: from apex.op_builder import AmpCBuilder amp_C = AmpCBuilder().load() @@ -143,6 +144,7 @@ def step(self, closure=None): closure (callable, optional): A closure that reevaluates the model and returns the loss. 
""" + multi_tensor_applier = MultiTensorApply(256*32) loss = None if closure is not None: loss = closure() diff --git a/apex/parallel/distributed.py b/apex/parallel/distributed.py index a4faadd89..15afe6fc3 100644 --- a/apex/parallel/distributed.py +++ b/apex/parallel/distributed.py @@ -6,7 +6,7 @@ from itertools import chain import copy import importlib -from ..multi_tensor_apply import multi_tensor_applier +from apex.multi_tensor_apply import MultiTensorApply imported_flatten_impl = False @@ -244,6 +244,7 @@ def __init__(self, "torch.cuda.DoubleTensor" : 2, "torch.cuda.BFloat16Tensor" : 3} + multi_tensor_applier = MultiTensorApply(256*32) if multi_tensor_applier.available: # TODO: I really need to centralize the C++ backed imports from apex.op_builder import AmpCBuilder @@ -427,6 +428,7 @@ def _event_this_bucket(self, bucket_idx): def allreduce_bucket(self, bucket, bucket_idx, force_default_stream): tensor = flatten(bucket) + multi_tensor_applier = MultiTensorApply(256*32) if force_default_stream: bucket_stream = self.main_stream diff --git a/apex/transformer/pipeline_parallel/utils.py b/apex/transformer/pipeline_parallel/utils.py index 268bcdca8..e225336f9 100644 --- a/apex/transformer/pipeline_parallel/utils.py +++ b/apex/transformer/pipeline_parallel/utils.py @@ -19,14 +19,11 @@ import torch from torch.nn.parallel import DistributedDataParallel -from apex.multi_tensor_apply import multi_tensor_applier +from apex.multi_tensor_apply import MultiTensorApply from apex.transformer import parallel_state from apex.transformer.enums import ModelType from apex.transformer.microbatches import build_num_microbatches_calculator from apex.transformer.pipeline_parallel._timers import _Timers -if multi_tensor_applier.available: - from apex.op_builder import AmpCBuilder - amp_C = AmpCBuilder().load() _GLOBAL_ARGS = None @@ -228,6 +225,10 @@ def calc_params_l2_norm(model: torch.nn.Module, bf16: bool): else: params_data.append(param.data) # Calculate norm + multi_tensor_applier = MultiTensorApply(256*32) + if multi_tensor_applier.available: + from apex.op_builder import AmpCBuilder + amp_C = AmpCBuilder().load() dummy_overflow_buf = torch.cuda.IntTensor([0]) norm, _ = multi_tensor_applier( amp_C.multi_tensor_l2norm, dummy_overflow_buf, [params_data], False # no per-parameter norm diff --git a/examples/imagenet/main_amp.py b/examples/imagenet/main_amp.py index c4b0fdfd5..a2de55a06 100644 --- a/examples/imagenet/main_amp.py +++ b/examples/imagenet/main_amp.py @@ -21,7 +21,6 @@ from apex.parallel import DistributedDataParallel as DDP from apex.fp16_utils import * from apex import amp, optimizers - from apex.multi_tensor_apply import multi_tensor_applier except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to run this example.") diff --git a/tests/L0/run_optimizers/test_lamb.py b/tests/L0/run_optimizers/test_lamb.py index eb7314600..d1c4f70c2 100644 --- a/tests/L0/run_optimizers/test_lamb.py +++ b/tests/L0/run_optimizers/test_lamb.py @@ -4,7 +4,7 @@ import torch from torch.optim import Optimizer import apex -from apex.multi_tensor_apply import multi_tensor_applier +from apex.multi_tensor_apply import MultiTensorApply from itertools import product class RefLAMB(Optimizer): @@ -37,6 +37,7 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-6, weight_decay=0 raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) super(RefLAMB, self).__init__(params, defaults) 
+ multi_tensor_applier = MultiTensorApply(256*32) if multi_tensor_applier.available: from apex.op_builder import AmpCBuilder amp_C = AmpCBuilder().load() @@ -73,6 +74,7 @@ def step(self, closure=None): device = self.param_groups[0]["params"][0].device g_norm_32, g_norm_16 = torch.zeros(1, device=device), torch.zeros(1, device=device) # compute grad norm for two lists + multi_tensor_applier = MultiTensorApply(256*32) if len(g_all_32) > 0: g_norm_32 = multi_tensor_applier(self.multi_tensor_l2norm, self._dummy_overflow_buf, diff --git a/tests/L1/common/main_amp.py b/tests/L1/common/main_amp.py index 106a0f637..93623068d 100644 --- a/tests/L1/common/main_amp.py +++ b/tests/L1/common/main_amp.py @@ -21,7 +21,7 @@ from apex.parallel import DistributedDataParallel as DDP from apex.fp16_utils import * from apex import amp, optimizers - from apex.multi_tensor_apply import multi_tensor_applier + from apex.multi_tensor_apply import MultiTensorApply except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to run this example.") @@ -99,6 +99,7 @@ def fast_collate(batch): # Let multi_tensor_applier be the canary in the coalmine # that verifies if the backend is what we think it is +multi_tensor_applier = MultiTensorApply(256*32) assert multi_tensor_applier.available == args.has_ext print("opt_level = {}".format(args.opt_level)) From bae9e719c879d25c228d186dd20c876f09ac5a54 Mon Sep 17 00:00:00 2001 From: skishore Date: Mon, 28 Jul 2025 17:05:50 +0000 Subject: [PATCH 46/79] Adding missing modules from deepspeed repo. Remove extra code in setup.py. Use is_compatible instead of is_supported --- accelerator/abstract_accelerator.py | 2 +- accelerator/cpu_accelerator.py | 8 +- accelerator/cuda_accelerator.py | 2 +- accelerator/logging.py | 151 +++++++++++++++++++++ accelerator/numa.py | 202 ++++++++++++++++++++++++++++ accelerator/real_accelerator.py | 4 +- accelerator/utils.py | 20 +++ op_builder/all_ops.py | 7 +- op_builder/builder.py | 15 +-- op_builder/nccl_allocator.py | 2 +- setup.py | 124 ++--------------- 11 files changed, 404 insertions(+), 133 deletions(-) create mode 100644 accelerator/logging.py create mode 100644 accelerator/numa.py create mode 100644 accelerator/utils.py diff --git a/accelerator/abstract_accelerator.py b/accelerator/abstract_accelerator.py index ecce91c48..71b63041f 100644 --- a/accelerator/abstract_accelerator.py +++ b/accelerator/abstract_accelerator.py @@ -1,7 +1,7 @@ # Copyright (c) Microsoft Corporation. # SPDX-License-Identifier: Apache-2.0 -# Taken from DeepSpeed +# DeepSpeed Team import abc from abc import ABC diff --git a/accelerator/cpu_accelerator.py b/accelerator/cpu_accelerator.py index e96b3c5d1..a14a53361 100644 --- a/accelerator/cpu_accelerator.py +++ b/accelerator/cpu_accelerator.py @@ -1,7 +1,7 @@ # Copyright (c) Microsoft Corporation. # SPDX-License-Identifier: Apache-2.0 -# Taken from DeepSpeed +# DeepSpeed Team from .abstract_accelerator import ApexAccelerator @@ -72,7 +72,7 @@ def device_count(self): if device_count > 0: return device_count else: - from apex.utils.numa import get_numa_cores + from apex.accelerator.numa import get_numa_cores # Count NUMA node for number of cpu accelerators. On machine with HBM # In flat mode, HBM is in separate NUMA node with no cores on this node. # Ignore these NUMA nodes with no cores. 
@@ -120,7 +120,7 @@ def Stream(self): return None def stream(self, stream): - from apex.runtime.utils import noop_context + from apex.accelerator.utils import noop_context return noop_context() def current_stream(self, device_index=None): @@ -246,7 +246,7 @@ def create_graph(self): return None def capture_to_graph(self, graph, pool=None, stream=None): - from apex.runtime.utils import noop_context + from apex.accelerator.utils import noop_context return noop_context() def replay_graph(self, graph): diff --git a/accelerator/cuda_accelerator.py b/accelerator/cuda_accelerator.py index a9dac1c5e..6eb11c390 100644 --- a/accelerator/cuda_accelerator.py +++ b/accelerator/cuda_accelerator.py @@ -1,7 +1,7 @@ # Copyright (c) Microsoft Corporation. # SPDX-License-Identifier: Apache-2.0 -# Taken from DeepSpeed +# DeepSpeed Team import functools import os diff --git a/accelerator/logging.py b/accelerator/logging.py new file mode 100644 index 000000000..d93ecbb2f --- /dev/null +++ b/accelerator/logging.py @@ -0,0 +1,151 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +import functools +import logging +import sys +import os + +log_levels = { + "debug": logging.DEBUG, + "info": logging.INFO, + "warning": logging.WARNING, + "error": logging.ERROR, + "critical": logging.CRITICAL, +} + + +class LoggerFactory: + + @staticmethod + def create_logger(name=None, level=logging.INFO): + """create a logger + + Args: + name (str): name of the logger + level: level of logger + + Raises: + ValueError is name is None + """ + + if name is None: + raise ValueError("name for logger cannot be None") + + formatter = logging.Formatter("[%(asctime)s] [%(levelname)s] " + "[%(filename)s:%(lineno)d:%(funcName)s] %(message)s") + + logger_ = logging.getLogger(name) + logger_.setLevel(level) + logger_.propagate = False + ch = logging.StreamHandler(stream=sys.stdout) + ch.setLevel(level) + ch.setFormatter(formatter) + logger_.addHandler(ch) + return logger_ + + +logger = LoggerFactory.create_logger(name="DeepSpeed", level=logging.INFO) + + +@functools.lru_cache(None) +def warning_once(*args, **kwargs): + """ + This method is identical to `logger.warning()`, but will emit the warning with the same message only once + + Note: The cache is for the function arguments, so 2 different callers using the same arguments will hit the cache. + The assumption here is that all warning messages are unique across the code. If they aren't then need to switch to + another type of cache that includes the caller frame information in the hashing function. + """ + logger.warning(*args, **kwargs) + + +logger.warning_once = warning_once + + +def print_configuration(args, name): + logger.info("{}:".format(name)) + for arg in sorted(vars(args)): + dots = "." 
* (29 - len(arg)) + logger.info(" {} {} {}".format(arg, dots, getattr(args, arg))) + + +def log_dist(message, ranks=None, level=logging.INFO): + from deepspeed import comm as dist + """Log message when one of following condition meets + + + not dist.is_initialized() + + dist.get_rank() in ranks if ranks is not None or ranks = [-1] + + Args: + message (str) + ranks (list) + level (int) + + """ + should_log = not dist.is_initialized() + ranks = ranks or [] + my_rank = dist.get_rank() if dist.is_initialized() else -1 + if ranks and not should_log: + should_log = ranks[0] == -1 + should_log = should_log or (my_rank in set(ranks)) + if should_log: + final_message = "[Rank {}] {}".format(my_rank, message) + logger.log(level, final_message) + + +def print_json_dist(message, ranks=None, path=None): + from deepspeed import comm as dist + """Print message when one of following condition meets + + + not dist.is_initialized() + + dist.get_rank() in ranks if ranks is not None or ranks = [-1] + + Args: + message (str) + ranks (list) + path (str) + + """ + should_log = not dist.is_initialized() + ranks = ranks or [] + my_rank = dist.get_rank() if dist.is_initialized() else -1 + if ranks and not should_log: + should_log = ranks[0] == -1 + should_log = should_log or (my_rank in set(ranks)) + if should_log: + message['rank'] = my_rank + import json + with open(path, 'w') as outfile: + json.dump(message, outfile) + os.fsync(outfile) + + +def get_current_level(): + """ + Return logger's current log level + """ + return logger.getEffectiveLevel() + + +def should_log_le(max_log_level_str): + """ + Args: + max_log_level_str: maximum log level as a string + + Returns ``True`` if the current log_level is less or equal to the specified log level. Otherwise ``False``. + + Example: + + ``should_log_le("info")`` will return ``True`` if the current log level is either ``logging.INFO`` or ``logging.DEBUG`` + """ + + if not isinstance(max_log_level_str, str): + raise ValueError(f"{max_log_level_str} is not a string") + + max_log_level_str = max_log_level_str.lower() + if max_log_level_str not in log_levels: + raise ValueError(f"{max_log_level_str} is not one of the `logging` levels") + + return get_current_level() <= log_levels[max_log_level_str] \ No newline at end of file diff --git a/accelerator/numa.py b/accelerator/numa.py new file mode 100644 index 000000000..08f277490 --- /dev/null +++ b/accelerator/numa.py @@ -0,0 +1,202 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +# return a list of list for cores to numa mapping +# [ +# [ cores for numa 0 ] +# [ cores belong to numa 1 ] +# ... +# ] + +import distutils +import os +import psutil +import subprocess + + +# return a list of list for cores to numa mapping +# [ +# [ cores for numa 0 ] +# [ cores belong to numa 1 ] +# ... 
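The accelerator/logging.py module added above is imported elsewhere in this series as apex.accelerator.logging (real_accelerator.py switches to it later in this patch). A short hypothetical usage sketch:

    from apex.accelerator.logging import logger, should_log_le

    logger.info("accelerator runtime initialised")
    logger.warning_once("fused kernel not found, falling back")   # emitted once per message
    if should_log_le("debug"):                                    # current level is DEBUG or lower
        logger.debug("verbose diagnostics enabled")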
+# ] +def get_numa_cores(): + ret = [] + output = subprocess.check_output(['numactl', '--hardware']).decode("utf-8") + lines = output.split('\n') + for line in lines: + if line.startswith('available:'): + num_numas = int(line.split(' ')[1]) + break + for numa in range(num_numas): + for line in lines: + if line.startswith(f'node {numa} cpus:'): + cores = line.split(' ')[3:] + ret.append([int(core) for core in cores]) + return ret + + +def check_for_numactl_pkg(): + libs = dict( + dpkg=["-l", "numactl", "apt"], + pacman=["-Q", "numactl", "pacman"], + rpm=["-q", "numactl", "yum"], + ) + + found = False + for pkgmgr, data in libs.items(): + flag, lib, tool = data + path = distutils.spawn.find_executable(pkgmgr) + if path is not None: + cmd = [pkgmgr, flag, lib] + result = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + if result.wait() == 0: + found = True + else: + print(f"please install the {lib} package with {tool}") + break + return found + + +def parse_range(rng): + try: + value = int(rng) + return range(value, value + 1) + except ValueError: + # value is not a single number + parts = rng.split('-') + if len(parts) != 2: + raise ValueError("Bad range: '%s', range must be either a number or two number separated by dash" % + (rng, )) + start = int(parts[0]) + end = int(parts[1]) + if start > end: + raise ValueError("Bad range: '%s', range end must larger than or equal to start" % (rng, )) + return range(start, end + 1) + + +# parse comma and dash separated range list into list +# i.e. "0,2-4,6" --> [0, 2, 3, 4, 6] +# rules: +# 1. Range list number be comma separated, each item are either a single number, +# or a range marked by two numbers (both number are included in the range) +# 2. Sub ranges must be in ascend order and not overlap with each other +# 3. No space in the range expression +def parse_range_list(range_str): + number_list = [] + last = -1 + range_list = range_str.split(',') + for sub_range in range_list: + sub_number_list = parse_range(sub_range) + if sub_number_list[0] <= last: + raise ValueError( + "Bad range: '%s', sub ranges must not overlap with each other and should be in ascend order" % + (range_str, )) + last = sub_number_list[-1] + number_list.extend(sub_number_list) + return number_list + + +def get_numactl_cmd(bind_core_list, num_local_procs, local_rank): + numactl_cmd = [] + check_for_numactl_pkg() + if 'KMP_AFFINITY' in os.environ.keys(): + raise ValueError("Environment variable KMP_AFFINITY conflicts with numactl " + "because it interfere with how many CPU cores numactl can set. 
" + "Unset KMP_AFFINITY before launching deepspeed.\n\n" + "\t$ unset KMP_AFFINITY\n" + "\t$ deepspeed ") + if bind_core_list is not None: + core_list = parse_range_list(bind_core_list) + total_cores = len(core_list) + else: + total_cores = psutil.cpu_count(logical=False) + core_list = range(total_cores) + cores_per_rank = total_cores // num_local_procs + assert cores_per_rank >= 1, "At least one core needs to be assigned to each rank" + core_list_for_rank = core_list[cores_per_rank * local_rank:cores_per_rank * (local_rank + 1)] + numactl_cmd.append("numactl") + + # check if all cores belong to same numa, if true, bind process to that numa domain with -m parameter + numa_cores = get_numa_cores() + num_numas = len(numa_cores) + + numa_mode = "normal" + + non_empty_numa_list = [] + empty_numa_list = [] + previous_numa_cores = [] + numa_node_list = [] + numa_node_list_list = [] + for i in range(num_numas): + # look for empty numa which is HBM numa + if numa_cores[i] == []: + empty_numa_list.append(i) + else: + non_empty_numa_list.append(i) + + # check for fakenuma + if numa_cores[i] == previous_numa_cores: + if numa_node_list == []: + #first duplication, add previous node into list + numa_node_list.append(i - 1) + numa_node_list.append(i) + else: + if numa_node_list != []: + numa_node_list_list.append(numa_node_list) + numa_node_list = [] + previous_numa_cores = numa_cores[i] + if numa_node_list != []: + numa_node_list_list.append(numa_node_list) + + if empty_numa_list != [] and len(empty_numa_list) == len(non_empty_numa_list): + numa_mode = "flat_hbm" + numa_dict = dict(zip(non_empty_numa_list, empty_numa_list)) + elif numa_node_list_list != []: + numa_mode = "fake" + + if numa_mode == "normal": + for i in range(num_numas): + if set(core_list_for_rank) <= set(numa_cores[i]): + numactl_cmd.append("-m") + numactl_cmd.append(f"{i}") + break + elif numa_mode == "flat_hbm": + for i in range(num_numas): + if set(core_list_for_rank) <= set(numa_cores[i]): + numactl_cmd.append("-p") + numactl_cmd.append(f"{numa_dict[i]}") + break + elif numa_mode == "fake": + for i in range(num_numas): + if set(core_list_for_rank) <= set(numa_cores[i]): + for nodes in numa_node_list_list: + if i in nodes: + numactl_cmd.append("-m") + numactl_cmd.append(f"{','.join(map(str, nodes))}") + break + # the following construct break the outer loop if inner loop breaks + else: + continue + break + + numactl_cmd.append("-C") + last_core = core_list_for_rank[0] + first_core = last_core + core_list_str = f"{last_core}" + for core_id in core_list_for_rank[1:]: + if core_id == last_core + 1: + last_core = core_id + continue + else: + if first_core == last_core: + core_list_str = f"{core_list_str},{core_id}" + else: + core_list_str = f"{core_list_str}-{last_core},{core_id}" + first_core = core_id + last_core = core_id + if first_core != last_core: + core_list_str = f"{core_list_str}-{last_core}" + numactl_cmd.append(f"{core_list_str}") + return cores_per_rank, numactl_cmd \ No newline at end of file diff --git a/accelerator/real_accelerator.py b/accelerator/real_accelerator.py index 59cdc68a7..e4092a5e8 100644 --- a/accelerator/real_accelerator.py +++ b/accelerator/real_accelerator.py @@ -1,13 +1,13 @@ # Copyright (c) Microsoft Corporation. # SPDX-License-Identifier: Apache-2.0 -# Taken from DeepSpeed +# DeepSpeed Team import os try: # Importing logger currently requires that torch is installed, hence the try...except # TODO: Remove logger dependency on torch. 
- from apex.utils import logger as accel_logger + from apex.accelerator.logging import logger as accel_logger except ImportError as e: accel_logger = None diff --git a/accelerator/utils.py b/accelerator/utils.py new file mode 100644 index 000000000..bb6ec2d7c --- /dev/null +++ b/accelerator/utils.py @@ -0,0 +1,20 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +""" +Copyright NVIDIA/Megatron + +Helper functions and classes from multiple sources. +""" + +class noop_context(object): + + def __init__(self): + pass + + def __enter__(self): + pass + + def __exit__(self, exc_type, exc_val, exc_tb): + pass \ No newline at end of file diff --git a/op_builder/all_ops.py b/op_builder/all_ops.py index 7c12e50d3..7de5ae8e6 100644 --- a/op_builder/all_ops.py +++ b/op_builder/all_ops.py @@ -1,3 +1,8 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + import os import pkgutil import importlib @@ -9,7 +14,7 @@ # List of all available ops -# reflect all builder names into __op_builders__ +# append all builder names into __op_builders__ op_builder_dir = get_accelerator().op_builder_dir() op_builder_module = importlib.import_module(op_builder_dir) __op_builders__ = [] diff --git a/op_builder/builder.py b/op_builder/builder.py index 8611e252c..4ee01f095 100644 --- a/op_builder/builder.py +++ b/op_builder/builder.py @@ -1,7 +1,7 @@ # Copyright (c) Microsoft Corporation. # SPDX-License-Identifier: Apache-2.0 -# Taken from DeepSpeed +# DeepSpeed Team import os import re @@ -310,12 +310,6 @@ def is_compatible(self, verbose=False): ''' return True - def is_supported(self): - ''' - Check if all conditions are satisfied to build this op - ''' - return True - def extra_ldflags(self): return [] @@ -695,7 +689,12 @@ def version_dependent_macros(self): version_ge_1_5 = [] if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 4): version_ge_1_5 = ['-DVERSION_GE_1_5'] - return version_ge_1_1 + version_ge_1_3 + version_ge_1_5 + + version_dependent_macro_args = version_ge_1_1 + version_ge_1_3 + version_ge_1_5 + if self.is_rocm_pytorch() and (self.torch_version()[0] >= 6): + version_dependent_macro_args += ["-DHIPBLAS_V2"] + + return version_dependent_macro_args def is_compatible(self, verbose=False): return super().is_compatible(verbose) diff --git a/op_builder/nccl_allocator.py b/op_builder/nccl_allocator.py index 0cbf27634..ca1636c94 100644 --- a/op_builder/nccl_allocator.py +++ b/op_builder/nccl_allocator.py @@ -27,7 +27,7 @@ def cxx_args(self): def nvcc_args(self): return self.nccl_args() - def is_supported(self): + def is_compatible(self, verbose=False): torch_version = self.torch_version() if torch_version >= (2, 6): available_nccl_version = self.nccl_version() diff --git a/setup.py b/setup.py index e3167b31f..5192fea80 100644 --- a/setup.py +++ b/setup.py @@ -26,37 +26,9 @@ from op_builder.all_ops import ALL_OPS, accelerator_name from op_builder.builder import installed_cuda_version -from accelerator import get_accelerator - # ninja build does not work unless include_dirs are abs path this_dir = os.path.dirname(os.path.abspath(__file__)) -torch_dir = torch.__path__[0] - - - -# https://github.com/pytorch/pytorch/pull/71881 -# For the extensions which have rocblas_gemm_flags_fp16_alt_impl we need to make sure if at::BackwardPassGuard exists. -# It helps the extensions be backward compatible with old PyTorch versions. 
-# The check and ROCM_BACKWARD_PASS_GUARD in nvcc/hipcc args can be retired once the PR is merged into PyTorch upstream. - -context_file = os.path.join(torch_dir, "include", "ATen", "Context.h") -if os.path.exists(context_file): - lines = open(context_file, 'r').readlines() - found_Backward_Pass_Guard = False - found_ROCmBackward_Pass_Guard = False - for line in lines: - if "BackwardPassGuard" in line: - # BackwardPassGuard has been renamed to ROCmBackwardPassGuard - # https://github.com/pytorch/pytorch/pull/71881/commits/4b82f5a67a35406ffb5691c69e6b4c9086316a43 - if "ROCmBackwardPassGuard" in line: - found_ROCmBackward_Pass_Guard = True - else: - found_Backward_Pass_Guard = True - break - -found_aten_atomic_header = False -if os.path.exists(os.path.join(torch_dir, "include", "ATen", "Atomic.cuh")): - found_aten_atomic_header = True + def get_cuda_bare_metal_version(cuda_dir): raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True) @@ -76,50 +48,6 @@ def get_rocm_bare_metal_version(rocm_dir): bare_metal_minor = release[1][0] return raw_output, bare_metal_major, bare_metal_minor -def check_cuda_torch_binary_vs_bare_metal(cuda_dir): - raw_output, bare_metal_major, bare_metal_minor = get_cuda_bare_metal_version(cuda_dir) - torch_binary_major = torch.version.cuda.split(".")[0] - torch_binary_minor = torch.version.cuda.split(".")[1] - - print("\nCompiling cuda extensions with") - print(raw_output + "from " + cuda_dir + "/bin\n") - - if (bare_metal_major != torch_binary_major) or (bare_metal_minor != torch_binary_minor): - raise RuntimeError( - "Cuda extensions are being compiled with a version of Cuda that does " - "not match the version used to compile Pytorch binaries. " - "Pytorch binaries were compiled with Cuda {}.\n".format(torch.version.cuda) - + "In some cases, a minor-version mismatch will not cause later errors: " - "https://github.com/NVIDIA/apex/pull/323#discussion_r287021798. " - "You can try commenting out this check (at your own risk)." - ) - -def check_rocm_torch_binary_vs_bare_metal(rocm_dir): - raw_output, bare_metal_major, bare_metal_minor = get_rocm_bare_metal_version(rocm_dir) - torch_binary_major = torch.version.hip.split(".")[0] - torch_binary_minor = torch.version.hip.split(".")[1] - - print("\nCompiling rocm extensions with") - print(raw_output + "from " + rocm_dir + "/bin\n") - - if (bare_metal_major != torch_binary_major) or (bare_metal_minor != torch_binary_minor): - raise RuntimeError( - "Cuda extensions are being compiled with a version of Cuda that does " - "not match the version used to compile Pytorch binaries. " - "Pytorch binaries were compiled with Cuda {}.\n".format(torch.version.cuda) - + "In some cases, a minor-version mismatch will not cause later errors: " - "https://github.com/NVIDIA/apex/pull/323#discussion_r287021798. " - "You can try commenting out this check (at your own risk)." - ) - -def raise_if_home_none(global_option: str) -> None: - if CUDA_HOME is not None or ROCM_HOME is not None: - return - raise RuntimeError( - f"{global_option} was requested, but nvcc was not found. Are you sure your environment has nvcc available? " - "If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, " - "only images whose names contain 'devel' will provide nvcc." 
- ) def get_apex_version(): cwd = os.path.dirname(os.path.abspath(__file__)) @@ -137,23 +65,6 @@ def get_apex_version(): apex_version += ".git"+os.getenv("APEX_COMMIT")[:8] return apex_version -def append_nvcc_threads(nvcc_extra_args): - _, bare_metal_major, bare_metal_minor = get_cuda_bare_metal_version(CUDA_HOME) - if int(bare_metal_major) >= 11 and int(bare_metal_minor) >= 2: - return nvcc_extra_args + ["--threads", "4"] - return nvcc_extra_args - - -def check_cudnn_version_and_warn(global_option: str, required_cudnn_version: int) -> bool: - cudnn_available = torch.backends.cudnn.is_available() - cudnn_version = torch.backends.cudnn.version() if cudnn_available else None - if not (cudnn_available and (cudnn_version >= required_cudnn_version)): - warnings.warn( - f"Skip `{global_option}` as it requires cuDNN {required_cudnn_version} or later, " - f"but {'cuDNN is not available' if not cudnn_available else cudnn_version}" - ) - return False - return True print("\n\ntorch.__version__ = {}\n\n".format(torch.__version__)) TORCH_MAJOR = int(torch.__version__.split('.')[0]) @@ -214,13 +125,6 @@ def check_if_rocm_pytorch(): _, bare_metal_version, bare_metal_minor = get_rocm_bare_metal_version(ROCM_HOME) -if "--cpp_ext" in sys.argv or "--cuda_ext" in sys.argv: - if TORCH_MAJOR == 0: - raise RuntimeError("--cpp_ext requires Pytorch 1.0 or later, " - "found torch.__version__ = {}".format(torch.__version__) - ) - - # ***************************** Op builder ********************** def get_env_if_set(key, default: typing.Any = ""): @@ -250,9 +154,15 @@ def command_exists(cmd): build_flags = { "APEX_BUILD_OPS" : BUILD_OP_DEFAULT, "APEX_CPP_OPS" : BUILD_CPP_OP, - "APEX_OPS" : BUILD_CUDA_OP, + "APEX_CUDA_OPS" : BUILD_CUDA_OP, } +if BUILD_CPP_OP or BUILD_CUDA_OP: + if TORCH_MAJOR == 0: + raise RuntimeError("--cpp_ext requires Pytorch 1.0 or later, " + "found torch.__version__ = {}".format(torch.__version__) + ) + ext_modules = [] def is_env_set(key): @@ -282,7 +192,6 @@ def is_op_included(op_name): for op_name, builder in ALL_OPS.items(): op_compatible = builder.is_compatible() op_included = is_op_included(op_name) - op_supported = builder.is_supported() # If op is requested but not available, throw an error. if op_enabled(op_name) and not op_compatible: @@ -297,12 +206,6 @@ def is_op_included(op_name): del install_ops[op_name] continue - #check if the conditions for building the module are satisfied - if not op_supported: - builder.warning(f"Skipping unsupported {op_name}; The conditions for building this module are not satisfied.") - del install_ops[op_name] - continue - # If op is compatible but install is not enabled (JIT mode). if IS_ROCM_PYTORCH and op_compatible and not op_enabled(op_name): builder.hipify_extension() @@ -313,15 +216,7 @@ def is_op_included(op_name): install_ops[op_name] = op_enabled(op_name) ext_modules.append(builder.builder()) -print(f'Install Ops={install_ops}') - -if "--cuda_ext" in sys.argv: - raise_if_home_none("--cuda_ext") - - if not IS_ROCM_PYTORCH: - check_cuda_torch_binary_vs_bare_metal(CUDA_HOME) - else: - check_rocm_torch_binary_vs_bare_metal(ROCM_HOME) +print(f'Install Ops={install_ops}') # Write out version/git info. 
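After this cleanup, the pre-compile decision that remains in setup.py boils down to roughly the sketch below. It is a paraphrase rather than the literal code and leaves out the per-op BUILD_VAR override, the error path for explicitly requested but incompatible ops, and the ROCm hipify step for JIT-only ops:

    # Rough paraphrase of the per-op selection above (illustrative only).
    def select_prebuilt_ops(all_ops, build_flags):
        selected = {}
        for op_name, builder in all_ops.items():
            if not build_flags.get(builder.INCLUDE_FLAG, 0):   # family flag unset -> leave for JIT
                continue
            if not builder.is_compatible():                    # toolchain/library checks failed
                continue
            selected[op_name] = builder.builder()              # CUDAExtension / CppExtension object
        return selected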
git_hash_cmd = shlex.split("bash -c \"git rev-parse --short HEAD\"") @@ -397,7 +292,6 @@ def is_op_included(op_name): fd.write(f"torch_info={torch_info}\n") - with open('requirements.txt') as f: required = f.read().splitlines() From 2b86e01ab82926662b873f7cb369c035203b983f Mon Sep 17 00:00:00 2001 From: skishore Date: Mon, 28 Jul 2025 17:14:19 +0000 Subject: [PATCH 47/79] change name of apex_C module --- op_builder/apex_C.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/op_builder/apex_C.py b/op_builder/apex_C.py index 2d458602e..50a196ee0 100644 --- a/op_builder/apex_C.py +++ b/op_builder/apex_C.py @@ -4,7 +4,7 @@ class ApexCBuilder(TorchCPUOpBuilder): - BUILD_VAR = 'APEX_BUILD_C' + BUILD_VAR = 'APEX_BUILD_APEX_C' INCLUDE_FLAG = "APEX_CPP_OPS" NAME = "apex_C" From b1b439d09eeec5a99abd97ce3f3b6c5d400ed47f Mon Sep 17 00:00:00 2001 From: skishore Date: Tue, 29 Jul 2025 16:10:50 +0000 Subject: [PATCH 48/79] change the name of cpp and cuda build flags, remove APEX_BUILD_OPS, cleanup the logic to build specific modules --- op_builder/amp_C.py | 2 +- op_builder/apex_C.py | 2 +- op_builder/bnp.py | 2 +- op_builder/distributed_adam.py | 2 +- op_builder/distributed_lamb.py | 2 +- op_builder/fast_multihead_attn.py | 2 +- op_builder/focal_loss.py | 2 +- op_builder/fused_adam.py | 2 +- op_builder/fused_bias_swiglu.py | 2 +- op_builder/fused_dense.py | 2 +- op_builder/fused_index_mul_2d.py | 2 +- op_builder/fused_lamb.py | 2 +- op_builder/fused_layer_norm.py | 2 +- op_builder/fused_rope.py | 2 +- op_builder/fused_weight_gradient_mlp.py | 2 +- .../generic_scaled_masked_softmax_cuda.py | 2 +- op_builder/mlp.py | 2 +- op_builder/nccl_allocator.py | 2 +- op_builder/nccl_p2p.py | 2 +- op_builder/peer_memory.py | 2 +- op_builder/scaled_masked_softmax_cuda.py | 2 +- op_builder/scaled_softmax_cuda.py | 2 +- ...scaled_upper_triang_masked_softmax_cuda.py | 2 +- op_builder/syncbn.py | 2 +- op_builder/transducer_joint.py | 2 +- op_builder/transducer_loss.py | 2 +- op_builder/xentropy.py | 2 +- setup.py | 34 +++++++------------ 28 files changed, 39 insertions(+), 49 deletions(-) diff --git a/op_builder/amp_C.py b/op_builder/amp_C.py index 284d2a808..41f029fcb 100644 --- a/op_builder/amp_C.py +++ b/op_builder/amp_C.py @@ -5,7 +5,7 @@ class AmpCBuilder(CUDAOpBuilder): BUILD_VAR = 'APEX_BUILD_AMP_C' - INCLUDE_FLAG = "APEX_CUDA_OPS" + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" NAME = "amp_C" def __init__(self): diff --git a/op_builder/apex_C.py b/op_builder/apex_C.py index 50a196ee0..2098b3b88 100644 --- a/op_builder/apex_C.py +++ b/op_builder/apex_C.py @@ -5,7 +5,7 @@ class ApexCBuilder(TorchCPUOpBuilder): BUILD_VAR = 'APEX_BUILD_APEX_C' - INCLUDE_FLAG = "APEX_CPP_OPS" + INCLUDE_FLAG = "APEX_BUILD_CPP_OPS" NAME = "apex_C" def __init__(self): diff --git a/op_builder/bnp.py b/op_builder/bnp.py index e15db8aa3..f7fbe1abd 100644 --- a/op_builder/bnp.py +++ b/op_builder/bnp.py @@ -5,7 +5,7 @@ class BnpBuilder(CUDAOpBuilder): BUILD_VAR = 'APEX_BUILD_BNP' - INCLUDE_FLAG = "APEX_CUDA_OPS" + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" NAME = "bnp" def __init__(self): diff --git a/op_builder/distributed_adam.py b/op_builder/distributed_adam.py index 2d142b397..ef453bee9 100644 --- a/op_builder/distributed_adam.py +++ b/op_builder/distributed_adam.py @@ -5,7 +5,7 @@ class DistributedAdamBuilder(CUDAOpBuilder): BUILD_VAR = 'APEX_BUILD_DISTRIBUTED_ADAM' - INCLUDE_FLAG = "APEX_CUDA_OPS" + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" NAME = "distributed_adam_cuda" def __init__(self): diff --git a/op_builder/distributed_lamb.py 
b/op_builder/distributed_lamb.py index 53b8a35ee..74d77d129 100644 --- a/op_builder/distributed_lamb.py +++ b/op_builder/distributed_lamb.py @@ -5,7 +5,7 @@ class DistributedLambBuilder(CUDAOpBuilder): BUILD_VAR = 'APEX_BUILD_DISTRIBUTED_LAMB' - INCLUDE_FLAG = "APEX_CUDA_OPS" + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" NAME = "distributed_lamb_cuda" def __init__(self): diff --git a/op_builder/fast_multihead_attn.py b/op_builder/fast_multihead_attn.py index 37a0eb09b..0f2f8b52f 100644 --- a/op_builder/fast_multihead_attn.py +++ b/op_builder/fast_multihead_attn.py @@ -5,7 +5,7 @@ class FastMultiheadAttnBuilder(CUDAOpBuilder): BUILD_VAR = 'APEX_BUILD_FAST_MULTIHEAD_ATTN' - INCLUDE_FLAG = "APEX_CUDA_OPS" + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" NAME = "fast_multihead_attn" def __init__(self): diff --git a/op_builder/focal_loss.py b/op_builder/focal_loss.py index 14d0f89f1..98a21330a 100644 --- a/op_builder/focal_loss.py +++ b/op_builder/focal_loss.py @@ -5,7 +5,7 @@ class FocalLossBuilder(CUDAOpBuilder): BUILD_VAR = 'APEX_BUILD_FOCAL_LOSS' - INCLUDE_FLAG = "APEX_CUDA_OPS" + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" NAME = "focal_loss_cuda" def __init__(self): diff --git a/op_builder/fused_adam.py b/op_builder/fused_adam.py index c352cf026..f335368d8 100644 --- a/op_builder/fused_adam.py +++ b/op_builder/fused_adam.py @@ -5,7 +5,7 @@ class FusedAdamBuilder(CUDAOpBuilder): BUILD_VAR = 'APEX_BUILD_FUSED_ADAM' - INCLUDE_FLAG = "APEX_CUDA_OPS" + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" NAME = "fused_adam_cuda" def __init__(self): diff --git a/op_builder/fused_bias_swiglu.py b/op_builder/fused_bias_swiglu.py index 77c1c58c6..4a7d13881 100644 --- a/op_builder/fused_bias_swiglu.py +++ b/op_builder/fused_bias_swiglu.py @@ -4,7 +4,7 @@ class FusedBiasSwiGLUBuilder(CUDAOpBuilder): BUILD_VAR = 'APEX_BUILD_FUSED_BIAS_SWIGLU' - INCLUDE_FLAG = "APEX_CUDA_OPS" + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" NAME = "fused_bias_swiglu" def __init__(self): diff --git a/op_builder/fused_dense.py b/op_builder/fused_dense.py index 3df1dfcac..4d40eef6d 100644 --- a/op_builder/fused_dense.py +++ b/op_builder/fused_dense.py @@ -5,7 +5,7 @@ class FusedDenseBuilder(CUDAOpBuilder): BUILD_VAR = 'APEX_BUILD_FUSED_DENSE' - INCLUDE_FLAG = "APEX_CUDA_OPS" + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" NAME = "fused_dense_cuda" def __init__(self): diff --git a/op_builder/fused_index_mul_2d.py b/op_builder/fused_index_mul_2d.py index 081061be4..d04564e15 100644 --- a/op_builder/fused_index_mul_2d.py +++ b/op_builder/fused_index_mul_2d.py @@ -5,7 +5,7 @@ class FusedIndexMul2dBuilder(CUDAOpBuilder): BUILD_VAR = 'APEX_BUILD_FUSED_INDEX_MUL_2D' - INCLUDE_FLAG = "APEX_CUDA_OPS" + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" NAME = "fused_index_mul_2d" def __init__(self): diff --git a/op_builder/fused_lamb.py b/op_builder/fused_lamb.py index 3f4dc027f..02a0b6fe7 100644 --- a/op_builder/fused_lamb.py +++ b/op_builder/fused_lamb.py @@ -5,7 +5,7 @@ class FusedLambBuilder(CUDAOpBuilder): BUILD_VAR = 'APEX_BUILD_FUSED_LAMB' - INCLUDE_FLAG = "APEX_CUDA_OPS" + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" NAME = "fused_lamb_cuda" def __init__(self): diff --git a/op_builder/fused_layer_norm.py b/op_builder/fused_layer_norm.py index f56c930aa..66130f17b 100644 --- a/op_builder/fused_layer_norm.py +++ b/op_builder/fused_layer_norm.py @@ -5,7 +5,7 @@ class FusedLayerNormBuilder(CUDAOpBuilder): BUILD_VAR = 'APEX_BUILD_FUSED_LAYER_NORM' - INCLUDE_FLAG = "APEX_CUDA_OPS" + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" NAME = "fused_layer_norm_cuda" def __init__(self): diff --git a/op_builder/fused_rope.py 
b/op_builder/fused_rope.py index 4bfe3b632..c87f14b84 100644 --- a/op_builder/fused_rope.py +++ b/op_builder/fused_rope.py @@ -5,7 +5,7 @@ class FusedRopeBuilder(CUDAOpBuilder): BUILD_VAR = 'APEX_BUILD_FUSED_ROPE' - INCLUDE_FLAG = "APEX_CUDA_OPS" + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" NAME = "fused_rotary_positional_embedding" def __init__(self): diff --git a/op_builder/fused_weight_gradient_mlp.py b/op_builder/fused_weight_gradient_mlp.py index 510a33046..b6d595385 100644 --- a/op_builder/fused_weight_gradient_mlp.py +++ b/op_builder/fused_weight_gradient_mlp.py @@ -2,7 +2,7 @@ class FusedWeightGradientMlpCudaBuilder(CUDAOpBuilder): BUILD_VAR = 'APEX_BUILD_FUSED_WEIGHT_GRADIENT_MLP' - INCLUDE_FLAG = "APEX_CUDA_OPS" + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" NAME = "fused_weight_gradient_mlp_cuda" def __init__(self): diff --git a/op_builder/generic_scaled_masked_softmax_cuda.py b/op_builder/generic_scaled_masked_softmax_cuda.py index c263c2d46..a0fb2d5fc 100644 --- a/op_builder/generic_scaled_masked_softmax_cuda.py +++ b/op_builder/generic_scaled_masked_softmax_cuda.py @@ -2,7 +2,7 @@ class GenericScaledMaskedSoftmaxCudaBuilder(CUDAOpBuilder): BUILD_VAR = 'APEX_BUILD_GENERIC_SCALED_MASKED_SOFTMAX_CUDA' - INCLUDE_FLAG = "APEX_CUDA_OPS" + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" NAME = "generic_scaled_masked_softmax_cuda" def __init__(self): diff --git a/op_builder/mlp.py b/op_builder/mlp.py index fb42e7d3d..c6a177721 100644 --- a/op_builder/mlp.py +++ b/op_builder/mlp.py @@ -5,7 +5,7 @@ class MlpBuilder(CUDAOpBuilder): BUILD_VAR = 'APEX_BUILD_MLP' - INCLUDE_FLAG = "APEX_CUDA_OPS" + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" NAME = "mlp_cuda" def __init__(self): diff --git a/op_builder/nccl_allocator.py b/op_builder/nccl_allocator.py index ca1636c94..320e76476 100644 --- a/op_builder/nccl_allocator.py +++ b/op_builder/nccl_allocator.py @@ -5,7 +5,7 @@ class NCCLAllocatorBuilder(CUDAOpBuilder): BUILD_VAR = 'APEX_BUILD_NCCL_ALLOCATOR' - INCLUDE_FLAG = "APEX_CUDA_OPS" + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" NAME = "_apex_nccl_allocator" def __init__(self): diff --git a/op_builder/nccl_p2p.py b/op_builder/nccl_p2p.py index ee9ed2bb5..37772572e 100644 --- a/op_builder/nccl_p2p.py +++ b/op_builder/nccl_p2p.py @@ -5,7 +5,7 @@ class NCCLP2PBuilder(CUDAOpBuilder): BUILD_VAR = 'APEX_BUILD_NCCL_P2P' - INCLUDE_FLAG = "APEX_CUDA_OPS" + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" NAME = "nccl_p2p_cuda" def __init__(self): diff --git a/op_builder/peer_memory.py b/op_builder/peer_memory.py index 07e712339..c869f0be6 100644 --- a/op_builder/peer_memory.py +++ b/op_builder/peer_memory.py @@ -5,7 +5,7 @@ class PeerMemoryBuilder(CUDAOpBuilder): BUILD_VAR = 'APEX_BUILD_PEER_MEMORY' - INCLUDE_FLAG = "APEX_CUDA_OPS" + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" NAME = "peer_memory_cuda" def __init__(self): diff --git a/op_builder/scaled_masked_softmax_cuda.py b/op_builder/scaled_masked_softmax_cuda.py index 0e5e0bbcf..1013ef8d2 100644 --- a/op_builder/scaled_masked_softmax_cuda.py +++ b/op_builder/scaled_masked_softmax_cuda.py @@ -2,7 +2,7 @@ class ScaledMaskedSoftmaxCudaBuilder(CUDAOpBuilder): BUILD_VAR = 'APEX_BUILD_SCALED_MASKED_SOFTMAX_CUDA' - INCLUDE_FLAG = "APEX_CUDA_OPS" + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" NAME = "scaled_masked_softmax_cuda" def __init__(self): diff --git a/op_builder/scaled_softmax_cuda.py b/op_builder/scaled_softmax_cuda.py index 89eb4fb08..f29543963 100644 --- a/op_builder/scaled_softmax_cuda.py +++ b/op_builder/scaled_softmax_cuda.py @@ -4,7 +4,7 @@ class ScaledSoftmaxCudaBuilder(CUDAOpBuilder): BUILD_VAR = 
'APEX_BUILD_SCALED_SOFTMAX_CUDA' - INCLUDE_FLAG = "APEX_CUDA_OPS" + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" NAME = "scaled_softmax_cuda" def __init__(self): diff --git a/op_builder/scaled_upper_triang_masked_softmax_cuda.py b/op_builder/scaled_upper_triang_masked_softmax_cuda.py index 4b9d6797c..3c2273ad9 100644 --- a/op_builder/scaled_upper_triang_masked_softmax_cuda.py +++ b/op_builder/scaled_upper_triang_masked_softmax_cuda.py @@ -2,7 +2,7 @@ class ScaledUpperTriangMaskedSoftmaxCudaBuilder(CUDAOpBuilder): BUILD_VAR = 'APEX_BUILD_SCALED_UPPER_TRIANG_MASKED_SOFTMAX_CUDA' - INCLUDE_FLAG = "APEX_CUDA_OPS" + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" NAME = "scaled_upper_triang_masked_softmax_cuda" def __init__(self): diff --git a/op_builder/syncbn.py b/op_builder/syncbn.py index 2dca996a8..251c33e01 100644 --- a/op_builder/syncbn.py +++ b/op_builder/syncbn.py @@ -5,7 +5,7 @@ class SyncBnBuilder(CUDAOpBuilder): BUILD_VAR = 'APEX_BUILD_SYNCBN' - INCLUDE_FLAG = "APEX_CUDA_OPS" + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" NAME = "syncbn" def __init__(self): diff --git a/op_builder/transducer_joint.py b/op_builder/transducer_joint.py index 248f5c985..c17f60f7b 100644 --- a/op_builder/transducer_joint.py +++ b/op_builder/transducer_joint.py @@ -4,7 +4,7 @@ class TransducerJointBuilder(CUDAOpBuilder): BUILD_VAR = 'APEX_BUILD_TRANSDUCER_JOINT' - INCLUDE_FLAG = "APEX_CUDA_OPS" + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" NAME = "transducer_joint_cuda" def __init__(self): diff --git a/op_builder/transducer_loss.py b/op_builder/transducer_loss.py index da3e5c461..53ae4eaac 100644 --- a/op_builder/transducer_loss.py +++ b/op_builder/transducer_loss.py @@ -4,7 +4,7 @@ class TransducerLossBuilder(CUDAOpBuilder): BUILD_VAR = 'APEX_BUILD_TRANSDUCER_LOSS' - INCLUDE_FLAG = "APEX_CUDA_OPS" + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" NAME = "transducer_loss_cuda" def __init__(self): diff --git a/op_builder/xentropy.py b/op_builder/xentropy.py index 107b2412c..84f3ddf12 100644 --- a/op_builder/xentropy.py +++ b/op_builder/xentropy.py @@ -5,7 +5,7 @@ class XentropyBuilder(CUDAOpBuilder): BUILD_VAR = 'APEX_BUILD_XENTROPY' - INCLUDE_FLAG = "APEX_CUDA_OPS" + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" NAME = "xentropy_cuda" def __init__(self): diff --git a/setup.py b/setup.py index 5192fea80..ea25cada5 100644 --- a/setup.py +++ b/setup.py @@ -145,19 +145,15 @@ def command_exists(cmd): result = subprocess.Popen(safe_cmd, stdout=subprocess.PIPE) return result.wait() == 0 - -BUILD_OP_PLATFORM = 1 if sys.platform == "win32" else 0 -BUILD_OP_DEFAULT = int(get_env_if_set('APEX_BUILD_OPS', BUILD_OP_PLATFORM)) -print(f"APEX_BUILD_OPS={BUILD_OP_DEFAULT}") -BUILD_CPP_OP = int(get_env_if_set('APEX_CPP_OPS', BUILD_OP_PLATFORM)) -BUILD_CUDA_OP = int(get_env_if_set('APEX_CUDA_OPS', BUILD_OP_PLATFORM)) +BUILD_OP_DEFAULT = 0 +BUILD_CPP_OPS = int(get_env_if_set('APEX_BUILD_CPP_OPS', BUILD_OP_DEFAULT)) +BUILD_CUDA_OPS = int(get_env_if_set('APEX_BUILD_CUDA_OPS', BUILD_OP_DEFAULT)) build_flags = { - "APEX_BUILD_OPS" : BUILD_OP_DEFAULT, - "APEX_CPP_OPS" : BUILD_CPP_OP, - "APEX_CUDA_OPS" : BUILD_CUDA_OP, + "APEX_BUILD_CPP_OPS" : BUILD_CPP_OPS, + "APEX_BUILD_CUDA_OPS" : BUILD_CUDA_OPS, } -if BUILD_CPP_OP or BUILD_CUDA_OP: +if BUILD_CPP_OPS or BUILD_CUDA_OPS: if TORCH_MAJOR == 0: raise RuntimeError("--cpp_ext requires Pytorch 1.0 or later, " "found torch.__version__ = {}".format(torch.__version__) @@ -191,29 +187,23 @@ def is_op_included(op_name): install_ops = dict.fromkeys(ALL_OPS.keys(), False) for op_name, builder in ALL_OPS.items(): op_compatible = 
builder.is_compatible()
-    op_included = is_op_included(op_name)
+    enabled = op_enabled(op_name) or is_op_included(op_name)

     # If op is requested but not available, throw an error.
-    if op_enabled(op_name) and not op_compatible:
-        env_var = op_envvar(op_name)
+    if enabled and not op_compatible:
+        env_var = op_envvar(op_name)
         if not is_env_set(env_var):
             builder.warning(f"Skip pre-compile of incompatible {op_name}; One can disable {op_name} with {env_var}=0")
             continue

-    #if the necessary build flags for the op is not provided, then skip building it
-    if not op_included:
-        builder.warning(f"Skipping unsupported {op_name}; Build flags for {op_name}: {ALL_OPS[op_name].INCLUDE_FLAG} not set")
-        del install_ops[op_name]
-        continue
-
     # If op is compatible but install is not enabled (JIT mode).
-    if IS_ROCM_PYTORCH and op_compatible and not op_enabled(op_name):
+    if IS_ROCM_PYTORCH and op_compatible and not enabled:
         builder.hipify_extension()

     # If op install enabled, add builder to extensions.
     # Also check if corresponding flags are checked
-    if op_enabled(op_name) and op_compatible:
-        install_ops[op_name] = op_enabled(op_name)
+    if enabled and op_compatible:
+        install_ops[op_name] = True
         ext_modules.append(builder.builder())

 print(f'Install Ops={install_ops}')

From ab7fbd7dc47e7d576833c6523175f83b9b3fc302 Mon Sep 17 00:00:00 2001
From: skishore
Date: Wed, 30 Jul 2025 06:53:44 +0000
Subject: [PATCH 49/79] add missing files used in cpu accelerator

---
 accelerator/cpu_accelerator.py | 21 +++++-------------
 op_builder/__init__.py | 4 ++--
 op_builder/cpu/__init__.py | 1 +
 op_builder/cpu/builder.py | 40 ++++++++++++++++++++++++++++++++++
 op_builder/cpu/no_impl.py | 24 ++++++++++++++++++++
 5 files changed, 72 insertions(+), 18 deletions(-)
 create mode 100644 op_builder/cpu/__init__.py
 create mode 100644 op_builder/cpu/builder.py
 create mode 100644 op_builder/cpu/no_impl.py

diff --git a/accelerator/cpu_accelerator.py b/accelerator/cpu_accelerator.py
index a14a53361..0dc376601 100644
--- a/accelerator/cpu_accelerator.py
+++ b/accelerator/cpu_accelerator.py
@@ -316,23 +316,12 @@ def get_op_builder(self, class_name):
         # is op_builder from apex or a 3p version?
this should only succeed if it's apex # if successful this also means we're doing a local install and not JIT compile path from op_builder import __apex__ # noqa: F401 # type: ignore - from op_builder.cpu import AsyncIOBuilder, CCLCommBuilder, ShareMemCommBuilder, FusedAdamBuilder, CPUAdamBuilder, NotImplementedBuilder + from op_builder.cpu import NotImplementedBuilder except ImportError: - from apex.ops.op_builder.cpu import AsyncIOBuilder, CCLCommBuilder, ShareMemCommBuilder, FusedAdamBuilder, CPUAdamBuilder, NotImplementedBuilder - - if class_name == "CCLCommBuilder": - return CCLCommBuilder - elif class_name == "ShareMemCommBuilder": - return ShareMemCommBuilder - elif class_name == "FusedAdamBuilder": - return FusedAdamBuilder - elif class_name == "CPUAdamBuilder": - return CPUAdamBuilder - elif class_name == "AsyncIOBuilder": - return AsyncIOBuilder - else: - # return a NotImplementedBuilder to avoid get NoneType[Name] in unit tests - return NotImplementedBuilder + from apex.op_builder.cpu import NotImplementedBuilder + + # return a NotImplementedBuilder to avoid get NoneType[Name] in unit tests + return NotImplementedBuilder def build_extension(self): from torch.utils.cpp_extension import BuildExtension diff --git a/op_builder/__init__.py b/op_builder/__init__.py index 9ca584f37..cc7ea9765 100644 --- a/op_builder/__init__.py +++ b/op_builder/__init__.py @@ -10,10 +10,10 @@ from .builder import get_default_compute_capabilities, OpBuilder -# Do not remove, required for abstract accelerator to detect if we have a deepspeed or 3p op_builder +# Do not remove, required for abstract accelerator to detect if we have a apex or 3p op_builder __apex__ = True -# List of all available op builders from deepspeed op_builder +# List of all available op builders from apex op_builder try: import apex.op_builder # noqa: F401 # type: ignore op_builder_dir = "apex.op_builder" diff --git a/op_builder/cpu/__init__.py b/op_builder/cpu/__init__.py new file mode 100644 index 000000000..e89339d38 --- /dev/null +++ b/op_builder/cpu/__init__.py @@ -0,0 +1 @@ +from .no_impl import NotImplementedBuilder \ No newline at end of file diff --git a/op_builder/cpu/builder.py b/op_builder/cpu/builder.py new file mode 100644 index 000000000..89e015fb7 --- /dev/null +++ b/op_builder/cpu/builder.py @@ -0,0 +1,40 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import os + +try: + # is op_builder from deepspeed or a 3p version? 
this should only succeed if it's deepspeed + # if successful this also means we're doing a local install and not JIT compile path + from op_builder import __deepspeed__ # noqa: F401 # type: ignore + from op_builder.builder import OpBuilder +except ImportError: + from deepspeed.ops.op_builder.builder import OpBuilder + + +class CPUOpBuilder(OpBuilder): + + def builder(self): + from torch.utils.cpp_extension import CppExtension as ExtensionBuilder + include_dirs = [os.path.abspath(x) for x in self.strip_empty_entries(self.include_paths())] + compile_args = {'cxx': self.strip_empty_entries(self.cxx_args())} + + cpp_ext = ExtensionBuilder(name=self.absolute_name(), + sources=self.strip_empty_entries(self.sources()), + include_dirs=include_dirs, + libraries=self.strip_empty_entries(self.libraries_args()), + extra_compile_args=compile_args) + + return cpp_ext + + def cxx_args(self): + args = ['-O3', '-g', '-Wno-reorder'] + CPU_ARCH = self.cpu_arch() + SIMD_WIDTH = self.simd_width() + args += [CPU_ARCH, '-fopenmp', SIMD_WIDTH] + return args + + def libraries_args(self): + return [] \ No newline at end of file diff --git a/op_builder/cpu/no_impl.py b/op_builder/cpu/no_impl.py new file mode 100644 index 000000000..23c2cf6d5 --- /dev/null +++ b/op_builder/cpu/no_impl.py @@ -0,0 +1,24 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from .builder import CPUOpBuilder + + +class NotImplementedBuilder(CPUOpBuilder): + BUILD_VAR = "DS_BUILD_NOT_IMPLEMENTED" + NAME = "deepspeed_not_implemented" + + def __init__(self, name=None): + name = self.NAME if name is None else name + super().__init__(name=name) + + def absolute_name(self): + return f'deepspeed.ops.comm.{self.NAME}_op' + + def load(self, verbose=True): + raise ValueError("This op had not been implemented on CPU backend.") + + def sources(self): + return [] \ No newline at end of file From 178c5fcf981f6ac5c5eec16cbb2c3cf626107ea6 Mon Sep 17 00:00:00 2001 From: skishore Date: Wed, 30 Jul 2025 08:00:37 +0000 Subject: [PATCH 50/79] add make clean command to handle deleting torch extensions installed for jit modules, fix the cpu builder import error --- Makefile | 12 +++++++++--- op_builder/cpu/builder.py | 6 +++--- scripts/clean.py | 16 ++++++++++++++++ 3 files changed, 28 insertions(+), 6 deletions(-) create mode 100644 scripts/clean.py diff --git a/Makefile b/Makefile index 9caf57155..99e44805f 100644 --- a/Makefile +++ b/Makefile @@ -2,9 +2,15 @@ PYTHON = python3 PIP = $(PYTHON) -m pip clean: # This will remove ALL build folders. - @rm -r build/ - @rm -r dist/ - @rm -r *.egg-info + @test -d build/ && echo "Deleting build folder" || true + @test -d build/ && rm -r build/ || true + @test -d dist/ && echo "Deleting dist folder" || true + @test -d dist/ && rm -r dist/ || true + @test -d apex.egg-info/ && echo "Deleting apex.egg-info folder" || true + @test -d apex.egg-info/ && rm -r apex.egg-info/ || true + + $(PYTHON) scripts/clean.py # remove the apex extensions installed at torch extensions folder + aiter: $(PIP) uninstall -y aiter cd third_party/aiter && $(PIP) install . --no-build-isolation --no-deps diff --git a/op_builder/cpu/builder.py b/op_builder/cpu/builder.py index 89e015fb7..eebe3c92e 100644 --- a/op_builder/cpu/builder.py +++ b/op_builder/cpu/builder.py @@ -6,12 +6,12 @@ import os try: - # is op_builder from deepspeed or a 3p version? this should only succeed if it's deepspeed + # is op_builder from apex or a 3p version? 
this should only succeed if it's apex # if successful this also means we're doing a local install and not JIT compile path - from op_builder import __deepspeed__ # noqa: F401 # type: ignore + from op_builder import __apex__ # noqa: F401 # type: ignore from op_builder.builder import OpBuilder except ImportError: - from deepspeed.ops.op_builder.builder import OpBuilder + from apex.op_builder.builder import OpBuilder class CPUOpBuilder(OpBuilder): diff --git a/scripts/clean.py b/scripts/clean.py new file mode 100644 index 000000000..be7e69798 --- /dev/null +++ b/scripts/clean.py @@ -0,0 +1,16 @@ +import torch.utils.cpp_extension +import shutil +import os +import sys + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) +from op_builder.all_ops import ALL_OPS + +torch_ext_directory = torch.utils.cpp_extension._get_build_directory("", False) + +install_ops = dict.fromkeys(ALL_OPS.keys(), False) +for op_name, builder in ALL_OPS.items(): + path = os.path.join(torch_ext_directory, op_name) + if os.path.exists(path): + print ("removing torch extension", op_name, "at", torch_ext_directory) + shutil.rmtree(path) \ No newline at end of file From 4f61ab3f7b557ab4e7f2b7ce7835e08e2013dcda Mon Sep 17 00:00:00 2001 From: skishore Date: Tue, 5 Aug 2025 10:15:43 +0000 Subject: [PATCH 51/79] remove unused code in setup.py, fix the code to build for cpu mode --- accelerator/cpu_accelerator.py | 62 ++++++++++++++++++++++++---------- accelerator/logging.py | 2 +- op_builder/all_ops.py | 1 + op_builder/cpu/no_impl.py | 6 ++-- setup.py | 13 +++---- 5 files changed, 57 insertions(+), 27 deletions(-) diff --git a/accelerator/cpu_accelerator.py b/accelerator/cpu_accelerator.py index 0dc376601..07af0dd54 100644 --- a/accelerator/cpu_accelerator.py +++ b/accelerator/cpu_accelerator.py @@ -4,6 +4,8 @@ # DeepSpeed Team from .abstract_accelerator import ApexAccelerator +import importlib +import pkgutil # During setup stage torch may not be installed, pass on no torch will # allow op builder related API to be executed. @@ -292,9 +294,9 @@ def op_builder_dir(self): # is op_builder from apex or a 3p version? this should only succeed if it's apex # if successful this also means we're doing a local install and not JIT compile path from op_builder import __apex__ # noqa: F401 # type: ignore - return "op_builder.cpu" + return "op_builder" except ImportError: - return "apex.op_builder.cpu" + return "apex.op_builder" def on_accelerator(self, tensor): device_str = str(tensor.device) @@ -304,24 +306,50 @@ def on_accelerator(self, tensor): return False # create an instance of op builder and return, name specified by class_name - def create_op_builder(self, op_name): - builder_class = self.get_op_builder(op_name) - if builder_class is not None: - return builder_class() - return None + def create_op_builder(self, class_name): + self._lazy_init_class_dict() + if class_name in self.class_dict: + return self.class_dict[class_name]() + else: + return None + + + # dict that holds class name <--> class type mapping i.e. 
+ # 'AsyncIOBuilder': + # this dict will be filled at init stage + class_dict = None + + def _lazy_init_class_dict(self): + if self.class_dict is not None: + return + else: + self.class_dict = {} + # begin initialize for create_op_builder() + # put all valid class name <--> class type mapping into class_dict + op_builder_dir = self.op_builder_dir() + op_builder_module = importlib.import_module(op_builder_dir) + op_builder_absolute_path = os.path.dirname(op_builder_module.__file__) + for _, module_name, _ in pkgutil.iter_modules([op_builder_absolute_path]): + # avoid self references, + # skip sub_directories which contains ops for other backend(cpu, npu, etc.). + if module_name != 'all_ops' and module_name != 'builder' and not os.path.isdir( + os.path.join(op_builder_absolute_path, module_name)): + module = importlib.import_module("{}.{}".format(op_builder_dir, module_name)) + for member_name in module.__dir__(): + if member_name.endswith( + 'Builder' + ) and member_name != "OpBuilder" and member_name != "CUDAOpBuilder" and member_name != "TorchCPUOpBuilder": # avoid abstract classes + if not member_name in self.class_dict: + self.class_dict[member_name] = getattr(module, member_name) + # end initialize for create_op_builder() # return an op builder class, name specified by class_name def get_op_builder(self, class_name): - try: - # is op_builder from apex or a 3p version? this should only succeed if it's apex - # if successful this also means we're doing a local install and not JIT compile path - from op_builder import __apex__ # noqa: F401 # type: ignore - from op_builder.cpu import NotImplementedBuilder - except ImportError: - from apex.op_builder.cpu import NotImplementedBuilder - - # return a NotImplementedBuilder to avoid get NoneType[Name] in unit tests - return NotImplementedBuilder + self._lazy_init_class_dict() + if class_name in self.class_dict: + return self.class_dict[class_name] + else: + return None def build_extension(self): from torch.utils.cpp_extension import BuildExtension diff --git a/accelerator/logging.py b/accelerator/logging.py index d93ecbb2f..d2074cc1a 100644 --- a/accelerator/logging.py +++ b/accelerator/logging.py @@ -46,7 +46,7 @@ def create_logger(name=None, level=logging.INFO): return logger_ -logger = LoggerFactory.create_logger(name="DeepSpeed", level=logging.INFO) +logger = LoggerFactory.create_logger(name="apex", level=logging.INFO) @functools.lru_cache(None) diff --git a/op_builder/all_ops.py b/op_builder/all_ops.py index 7de5ae8e6..fa4943cb2 100644 --- a/op_builder/all_ops.py +++ b/op_builder/all_ops.py @@ -27,6 +27,7 @@ if member_name.endswith('Builder'): # append builder to __op_builders__ list builder = get_accelerator().create_op_builder(member_name) + print ("--", member_name, type(builder)) __op_builders__.append(builder) ALL_OPS = {op.name: op for op in __op_builders__ if op is not None} diff --git a/op_builder/cpu/no_impl.py b/op_builder/cpu/no_impl.py index 23c2cf6d5..9b2856c85 100644 --- a/op_builder/cpu/no_impl.py +++ b/op_builder/cpu/no_impl.py @@ -7,15 +7,15 @@ class NotImplementedBuilder(CPUOpBuilder): - BUILD_VAR = "DS_BUILD_NOT_IMPLEMENTED" - NAME = "deepspeed_not_implemented" + BUILD_VAR = "APEX_BUILD_NOT_IMPLEMENTED" + NAME = "apex_not_implemented" def __init__(self, name=None): name = self.NAME if name is None else name super().__init__(name=name) def absolute_name(self): - return f'deepspeed.ops.comm.{self.NAME}_op' + return f'apex.{self.NAME}_op' def load(self, verbose=True): raise ValueError("This op had not been implemented on CPU 
backend.") diff --git a/setup.py b/setup.py index ea25cada5..0353d77c2 100644 --- a/setup.py +++ b/setup.py @@ -22,9 +22,7 @@ sys.path.insert(0, os.path.abspath(os.path.dirname(__file__))) -from op_builder import get_default_compute_capabilities, OpBuilder from op_builder.all_ops import ALL_OPS, accelerator_name -from op_builder.builder import installed_cuda_version # ninja build does not work unless include_dirs are abs path this_dir = os.path.dirname(os.path.abspath(__file__)) @@ -115,8 +113,6 @@ def check_if_rocm_pytorch(): ) # cmdclass = {} -ext_modules = [] - extras = {} if not IS_ROCM_PYTORCH: @@ -159,8 +155,6 @@ def command_exists(cmd): "found torch.__version__ = {}".format(torch.__version__) ) -ext_modules = [] - def is_env_set(key): """ Checks if an environment variable is set and not "". @@ -184,7 +178,9 @@ def is_op_included(op_name): include_flag = ALL_OPS[op_name].INCLUDE_FLAG return get_env_if_set(include_flag, False) +ext_modules = [] install_ops = dict.fromkeys(ALL_OPS.keys(), False) + for op_name, builder in ALL_OPS.items(): op_compatible = builder.is_compatible() enabled = op_enabled(op_name) or is_op_included(op_name) @@ -281,6 +277,11 @@ def is_op_included(op_name): fd.write(f"accelerator_name='{accelerator_name}'\n") fd.write(f"torch_info={torch_info}\n") +if "--cpp_ext" in sys.argv: + sys.argv.remove("--cpp_ext") + +if "--cuda_ext" in sys.argv: + sys.argv.remove("--cuda_ext") with open('requirements.txt') as f: required = f.read().splitlines() From 56950daa0a14714429f09b64d2cbf31ee67594ae Mon Sep 17 00:00:00 2001 From: skishore Date: Tue, 5 Aug 2025 10:54:22 +0000 Subject: [PATCH 52/79] Removing unused code --- op_builder/all_ops.py | 1 - op_builder/cpu/__init__.py | 1 - op_builder/cpu/builder.py | 40 -------------------------------------- op_builder/cpu/no_impl.py | 24 ----------------------- 4 files changed, 66 deletions(-) delete mode 100644 op_builder/cpu/__init__.py delete mode 100644 op_builder/cpu/builder.py delete mode 100644 op_builder/cpu/no_impl.py diff --git a/op_builder/all_ops.py b/op_builder/all_ops.py index fa4943cb2..7de5ae8e6 100644 --- a/op_builder/all_ops.py +++ b/op_builder/all_ops.py @@ -27,7 +27,6 @@ if member_name.endswith('Builder'): # append builder to __op_builders__ list builder = get_accelerator().create_op_builder(member_name) - print ("--", member_name, type(builder)) __op_builders__.append(builder) ALL_OPS = {op.name: op for op in __op_builders__ if op is not None} diff --git a/op_builder/cpu/__init__.py b/op_builder/cpu/__init__.py deleted file mode 100644 index e89339d38..000000000 --- a/op_builder/cpu/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .no_impl import NotImplementedBuilder \ No newline at end of file diff --git a/op_builder/cpu/builder.py b/op_builder/cpu/builder.py deleted file mode 100644 index eebe3c92e..000000000 --- a/op_builder/cpu/builder.py +++ /dev/null @@ -1,40 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# SPDX-License-Identifier: Apache-2.0 - -# DeepSpeed Team - -import os - -try: - # is op_builder from apex or a 3p version? 
this should only succeed if it's apex - # if successful this also means we're doing a local install and not JIT compile path - from op_builder import __apex__ # noqa: F401 # type: ignore - from op_builder.builder import OpBuilder -except ImportError: - from apex.op_builder.builder import OpBuilder - - -class CPUOpBuilder(OpBuilder): - - def builder(self): - from torch.utils.cpp_extension import CppExtension as ExtensionBuilder - include_dirs = [os.path.abspath(x) for x in self.strip_empty_entries(self.include_paths())] - compile_args = {'cxx': self.strip_empty_entries(self.cxx_args())} - - cpp_ext = ExtensionBuilder(name=self.absolute_name(), - sources=self.strip_empty_entries(self.sources()), - include_dirs=include_dirs, - libraries=self.strip_empty_entries(self.libraries_args()), - extra_compile_args=compile_args) - - return cpp_ext - - def cxx_args(self): - args = ['-O3', '-g', '-Wno-reorder'] - CPU_ARCH = self.cpu_arch() - SIMD_WIDTH = self.simd_width() - args += [CPU_ARCH, '-fopenmp', SIMD_WIDTH] - return args - - def libraries_args(self): - return [] \ No newline at end of file diff --git a/op_builder/cpu/no_impl.py b/op_builder/cpu/no_impl.py deleted file mode 100644 index 9b2856c85..000000000 --- a/op_builder/cpu/no_impl.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# SPDX-License-Identifier: Apache-2.0 - -# DeepSpeed Team - -from .builder import CPUOpBuilder - - -class NotImplementedBuilder(CPUOpBuilder): - BUILD_VAR = "APEX_BUILD_NOT_IMPLEMENTED" - NAME = "apex_not_implemented" - - def __init__(self, name=None): - name = self.NAME if name is None else name - super().__init__(name=name) - - def absolute_name(self): - return f'apex.{self.NAME}_op' - - def load(self, verbose=True): - raise ValueError("This op had not been implemented on CPU backend.") - - def sources(self): - return [] \ No newline at end of file From 327c8cf9adb41f5c542c0ff5d665b4d64413efd5 Mon Sep 17 00:00:00 2001 From: skishore Date: Tue, 5 Aug 2025 21:51:22 +0000 Subject: [PATCH 53/79] remove accelerator package and refactor the used code into op_builder.all_ops BuilderUtils class --- accelerator/__init__.py | 7 - accelerator/abstract_accelerator.py | 306 ---------------------- accelerator/cpu_accelerator.py | 378 --------------------------- accelerator/cuda_accelerator.py | 385 ---------------------------- accelerator/logging.py | 151 ----------- accelerator/numa.py | 202 --------------- accelerator/real_accelerator.py | 126 --------- accelerator/utils.py | 20 -- apex/git_version_info.py | 1 - op_builder/__init__.py | 8 +- op_builder/all_ops.py | 68 ++++- op_builder/builder.py | 5 +- setup.py | 3 +- 13 files changed, 66 insertions(+), 1594 deletions(-) delete mode 100644 accelerator/__init__.py delete mode 100644 accelerator/abstract_accelerator.py delete mode 100644 accelerator/cpu_accelerator.py delete mode 100644 accelerator/cuda_accelerator.py delete mode 100644 accelerator/logging.py delete mode 100644 accelerator/numa.py delete mode 100644 accelerator/real_accelerator.py delete mode 100644 accelerator/utils.py diff --git a/accelerator/__init__.py b/accelerator/__init__.py deleted file mode 100644 index e145afb03..000000000 --- a/accelerator/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright (c) Microsoft Corporation. 
-# SPDX-License-Identifier: Apache-2.0 - -# DeepSpeed Team - -from .abstract_accelerator import ApexAccelerator -from .real_accelerator import get_accelerator, set_accelerator, is_current_accelerator_supported \ No newline at end of file diff --git a/accelerator/abstract_accelerator.py b/accelerator/abstract_accelerator.py deleted file mode 100644 index 71b63041f..000000000 --- a/accelerator/abstract_accelerator.py +++ /dev/null @@ -1,306 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# SPDX-License-Identifier: Apache-2.0 - -# DeepSpeed Team - -import abc -from abc import ABC - - -class ApexAccelerator(ABC): - - def __init__(self): - self._name = None - self._communication_backend_name = None - self._compile_backend = None - - @abc.abstractmethod - def is_synchronized_device(self): - ... - - @abc.abstractmethod - def use_host_timers(self): - ... - - @abc.abstractmethod - def resolves_data_dependency(self): - ... - - @abc.abstractmethod - def handles_memory_backpressure(self): - ... - - # Device APIs - @abc.abstractmethod - def device_name(self, device_index): - ... - - @abc.abstractmethod - def device(self, device_index): - ... - - @abc.abstractmethod - def set_device(self, device_index): - ... - - @abc.abstractmethod - def current_device(self): - ... - - @abc.abstractmethod - def current_device_name(self): - ... - - @abc.abstractmethod - def device_count(self): - ... - - @abc.abstractmethod - def synchronize(self, device_index=None): - ... - - # RNG APIs - @abc.abstractmethod - def random(self): - ... - - @abc.abstractmethod - def set_rng_state(self, new_state, device_index=None): - ... - - @abc.abstractmethod - def get_rng_state(self, device_index=None): - ... - - @abc.abstractmethod - def manual_seed(self, seed): - ... - - @abc.abstractmethod - def manual_seed_all(self, seed): - ... - - @abc.abstractmethod - def initial_seed(self): - ... - - @abc.abstractmethod - def default_generator(self, device_index): - ... - - # Streams/Events - @property - @abc.abstractmethod - def Stream(self): - ... - - @abc.abstractmethod - def stream(self, stream): - ... - - @abc.abstractmethod - def current_stream(self, device_index=None): - ... - - @abc.abstractmethod - def default_stream(self, device_index=None): - ... - - @property - @abc.abstractmethod - def Event(self): - ... - - # Memory management - @abc.abstractmethod - def empty_cache(self): - ... - - @abc.abstractmethod - def memory_allocated(self, device_index=None): - ... - - @abc.abstractmethod - def max_memory_allocated(self, device_index=None): - ... - - @abc.abstractmethod - def reset_max_memory_allocated(self, device_index=None): - ... - - @abc.abstractmethod - def memory_cached(self, device_index=None): - ... - - @abc.abstractmethod - def max_memory_cached(self, device_index=None): - ... - - @abc.abstractmethod - def reset_max_memory_cached(self, device_index=None): - ... - - @abc.abstractmethod - def memory_stats(self, device_index=None): - ... - - @abc.abstractmethod - def reset_peak_memory_stats(self, device_index=None): - ... - - @abc.abstractmethod - def memory_reserved(self, device_index=None): - ... - - @abc.abstractmethod - def max_memory_reserved(self, device_index=None): - ... - - @abc.abstractmethod - def total_memory(self, device_index=None): - ... - - @abc.abstractmethod - def available_memory(self, device_index=None): - ... - - # Data types - @abc.abstractmethod - def is_bf16_supported(self): - ... - - @abc.abstractmethod - def is_fp16_supported(self): - ... - - @abc.abstractmethod - def supported_dtypes(self): - ... 
- - # Misc - @abc.abstractmethod - def amp(self): - ... - - @abc.abstractmethod - def is_available(self): - ... - - @abc.abstractmethod - def range_push(self, msg): - ... - - @abc.abstractmethod - def range_pop(self): - ... - - @abc.abstractmethod - def lazy_call(self, callback): - ... - - @abc.abstractmethod - def communication_backend_name(self): - ... - - @abc.abstractmethod - def is_triton_supported(self): - ... - - # Graph operations - @abc.abstractmethod - def create_graph(self): - ... - - @abc.abstractmethod - def capture_to_graph(self, graph, pool=None, stream=None): - ... - - @abc.abstractmethod - def replay_graph(self, graph): - ... - - # Tensor operations - @property - @abc.abstractmethod - def BFloat16Tensor(self): - ... - - @property - @abc.abstractmethod - def ByteTensor(self): - ... - - @property - @abc.abstractmethod - def DoubleTensor(self): - ... - - @property - @abc.abstractmethod - def FloatTensor(self): - ... - - @property - @abc.abstractmethod - def HalfTensor(self): - ... - - @property - @abc.abstractmethod - def IntTensor(self): - ... - - @property - @abc.abstractmethod - def LongTensor(self): - ... - - @abc.abstractmethod - def pin_memory(self, tensor, align_bytes=1): - ... - - @abc.abstractmethod - def is_pinned(self, tensor): - ... - - @abc.abstractmethod - def on_accelerator(self, tensor): - ... - - @abc.abstractmethod - def op_builder_dir(self): - ... - - # create an instance of op builder, specified by class_name - @abc.abstractmethod - def create_op_builder(self, class_name): - ... - - # return an op builder class, specified by class_name - @abc.abstractmethod - def get_op_builder(self, class_name): - ... - - @abc.abstractmethod - def build_extension(self): - ... - - @abc.abstractmethod - def export_envs(self): - ... - - @abc.abstractmethod - def visible_devices_envs(self): - ... - - @abc.abstractmethod - def set_visible_devices_envs(self, current_env, local_accelerator_ids): - ... - - @abc.abstractmethod - def get_compile_backend(self): - ... - - @abc.abstractmethod - def set_compile_backend(self, backend): - ... \ No newline at end of file diff --git a/accelerator/cpu_accelerator.py b/accelerator/cpu_accelerator.py deleted file mode 100644 index 07af0dd54..000000000 --- a/accelerator/cpu_accelerator.py +++ /dev/null @@ -1,378 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# SPDX-License-Identifier: Apache-2.0 - -# DeepSpeed Team - -from .abstract_accelerator import ApexAccelerator -import importlib -import pkgutil - -# During setup stage torch may not be installed, pass on no torch will -# allow op builder related API to be executed. 
-try: - import torch -except ImportError as e: - pass - -try: - import oneccl_bindings_for_pytorch # noqa: F401 # type: ignore - oneccl_imported_p = True -except ImportError as e: - oneccl_imported_p = False - -import os - - -# accelerator for Intel CPU -class CPU_Accelerator(ApexAccelerator): - - def __init__(self): - self._name = 'cpu' - self._compile_backend = "inductor" - if oneccl_imported_p: - self._communication_backend_name = 'ccl' - else: - # fallback to gloo if oneccl_binding_for_pytorch is not installed - self._communication_backend_name = 'gloo' - try: - import psutil - mem = psutil.Process().memory_info().rss - self.max_mem = mem - except ImportError as e: - self.max_mem = 0 - - def is_synchronized_device(self): - return True - - def use_host_timers(self): - return self.is_synchronized_device() - - def resolves_data_dependency(self): - return self.is_synchronized_device() - - def handles_memory_backpressure(self): - return self.is_synchronized_device() - - # Device APIs - def device_name(self, device_index=None): - return 'cpu' - - def device(self, device_index=None): - return None - - def set_device(self, device_index): - return - - def current_device(self): - return os.environ.get('LOCAL_RANK', 0) - - def current_device_name(self): - return 'cpu' - - def device_count(self): - device_count = int(os.environ.get('LOCAL_SIZE', 0)) - if device_count > 0: - return device_count - else: - from apex.accelerator.numa import get_numa_cores - # Count NUMA node for number of cpu accelerators. On machine with HBM - # In flat mode, HBM is in separate NUMA node with no cores on this node. - # Ignore these NUMA nodes with no cores. - numa_core_lists = get_numa_cores() - if not numa_core_lists: - return 1 - numa_count = 0 - prev_core_list = [] - for core_list in numa_core_lists: - if len(core_list) > 0 and core_list != prev_core_list: - numa_count += 1 - prev_core_list = core_list - return numa_count - - def synchronize(self, device_index=None): - return - - # RNG APIs - def random(self): - return torch.random - - def set_rng_state(self, new_state, device_index=None): - if device_index is None: - return torch.set_rng_state(new_state) - return torch.set_rng_state(new_state, device_index) - - def get_rng_state(self, device_index=None): - return torch.get_rng_state() - - def manual_seed(self, seed): - return torch.manual_seed(seed) - - def manual_seed_all(self, seed): - return torch.manual_seed(seed) - - def initial_seed(self): - return torch.initial_seed() - - def default_generator(self, device_index): - return torch.default_generator - - # Streams/Events - @property - def Stream(self): - return None - - def stream(self, stream): - from apex.accelerator.utils import noop_context - return noop_context() - - def current_stream(self, device_index=None): - return None - - def default_stream(self, device_index=None): - return None - - @property - def Event(self): - return None - - # Memory management - def empty_cache(self): - return - - def get_rss(self): - import psutil - mem = psutil.Process().memory_info().rss - if mem > self.max_mem: - self.max_mem = mem - return mem - - def reset_rss(self): - import psutil - mem = psutil.Process().memory_info().rss - self.max_mem = mem - return mem - - def memory_allocated(self, device_index=None): - return self.get_rss() - - def max_memory_allocated(self, device_index=None): - self.get_rss() - return self.max_mem - - def reset_max_memory_allocated(self, device_index=None): - self.reset_rss() - return - - def memory_cached(self, device_index=None): - return 
self.get_rss() - - def max_memory_cached(self, device_index=None): - self.get_rss() - return self.max_mem - - def reset_max_memory_cached(self, device_index=None): - self.reset_rss() - return - - def memory_stats(self, device_index=None): - mem = self.get_rss() - mem_stat = {} - mem_stat['allocated_bytes.all.current'] = mem - mem_stat['allocated_bytes.all.peak'] = self.max_mem - return mem_stat - - def reset_peak_memory_stats(self, device_index=None): - self.reset_rss() - return - - def memory_reserved(self, device_index=None): - return self.get_rss() - - def max_memory_reserved(self, device_index=None): - self.get_rss() - return self.max_mem - - def total_memory(self, device_index=None): - import psutil - return psutil.virtual_memory().total - - def available_memory(self, device_index=None): - import psutil - return psutil.virtual_memory().available - - # Misc - def amp(self): - return torch.cpu.amp - - def is_available(self): - return True - - def range_push(self, msg): - # TODO itt is currently not supported yet - # return torch.profiler.itt.range_push(msg) - return - - def range_pop(self): - # TODO itt is currently not supported yet - # return torch.profiler.itt.range_pop() - return - - def lazy_call(self, callback): - return callback() - - def communication_backend_name(self): - return self._communication_backend_name - - def is_triton_supported(self): - return False - - # Data types - def is_bf16_supported(self): - return True - - def is_fp16_supported(self): - try: - if torch.ops.mkldnn._is_mkldnn_fp16_supported(): - return True - except: - return False - - def supported_dtypes(self): - supported_dtypes = [torch.float, torch.bfloat16] - if self.is_fp16_supported(): - supported_dtypes.append(torch.float16) - return supported_dtypes - - # Graph operations - def create_graph(self): - return None - - def capture_to_graph(self, graph, pool=None, stream=None): - from apex.accelerator.utils import noop_context - return noop_context() - - def replay_graph(self, graph): - return - - # Tensor operations - @property - def BFloat16Tensor(self): - return torch.BFloat16Tensor - - @property - def ByteTensor(self): - return torch.ByteTensor - - @property - def DoubleTensor(self): - return torch.DoubleTensor - - @property - def FloatTensor(self): - return torch.FloatTensor - - @property - def HalfTensor(self): - return torch.HalfTensor - - @property - def IntTensor(self): - return torch.IntTensor - - @property - def LongTensor(self): - return torch.LongTensor - - def pin_memory(self, tensor, align_bytes=1): - return tensor - - def is_pinned(self, tensor): - return tensor.is_pinned() - - def op_builder_dir(self): - try: - # is op_builder from apex or a 3p version? this should only succeed if it's apex - # if successful this also means we're doing a local install and not JIT compile path - from op_builder import __apex__ # noqa: F401 # type: ignore - return "op_builder" - except ImportError: - return "apex.op_builder" - - def on_accelerator(self, tensor): - device_str = str(tensor.device) - if device_str.startswith('cpu'): - return True - else: - return False - - # create an instance of op builder and return, name specified by class_name - def create_op_builder(self, class_name): - self._lazy_init_class_dict() - if class_name in self.class_dict: - return self.class_dict[class_name]() - else: - return None - - - # dict that holds class name <--> class type mapping i.e. 
- # 'AsyncIOBuilder': - # this dict will be filled at init stage - class_dict = None - - def _lazy_init_class_dict(self): - if self.class_dict is not None: - return - else: - self.class_dict = {} - # begin initialize for create_op_builder() - # put all valid class name <--> class type mapping into class_dict - op_builder_dir = self.op_builder_dir() - op_builder_module = importlib.import_module(op_builder_dir) - op_builder_absolute_path = os.path.dirname(op_builder_module.__file__) - for _, module_name, _ in pkgutil.iter_modules([op_builder_absolute_path]): - # avoid self references, - # skip sub_directories which contains ops for other backend(cpu, npu, etc.). - if module_name != 'all_ops' and module_name != 'builder' and not os.path.isdir( - os.path.join(op_builder_absolute_path, module_name)): - module = importlib.import_module("{}.{}".format(op_builder_dir, module_name)) - for member_name in module.__dir__(): - if member_name.endswith( - 'Builder' - ) and member_name != "OpBuilder" and member_name != "CUDAOpBuilder" and member_name != "TorchCPUOpBuilder": # avoid abstract classes - if not member_name in self.class_dict: - self.class_dict[member_name] = getattr(module, member_name) - # end initialize for create_op_builder() - - # return an op builder class, name specified by class_name - def get_op_builder(self, class_name): - self._lazy_init_class_dict() - if class_name in self.class_dict: - return self.class_dict[class_name] - else: - return None - - def build_extension(self): - from torch.utils.cpp_extension import BuildExtension - return BuildExtension - - def export_envs(self): - return [] - - # TODO: cpu's visible envs is confirmed, keep as CUDA_VISIBLE_DEVICES - def visible_devices_envs(self): - return ['CUDA_VISIBLE_DEVICES'] - - def set_visible_devices_envs(self, current_env, local_accelerator_ids): - for env in self.visible_devices_envs(): - current_env[env] = ",".join(map(str, local_accelerator_ids)) - - def get_compile_backend(self): - return self._compile_backend - - def set_compile_backend(self, backend): - supported_backends = torch._dynamo.list_backends(exclude_tags=()) - if backend in supported_backends: - self._compile_backend = backend - else: - raise ValueError( - f"{backend} not supported by {self.device_name()}. Supported Backends are {supported_backends}") \ No newline at end of file diff --git a/accelerator/cuda_accelerator.py b/accelerator/cuda_accelerator.py deleted file mode 100644 index 6eb11c390..000000000 --- a/accelerator/cuda_accelerator.py +++ /dev/null @@ -1,385 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# SPDX-License-Identifier: Apache-2.0 - -# DeepSpeed Team - -import functools -import os -import pkgutil -import importlib -import sys - -from .abstract_accelerator import ApexAccelerator -# During setup stage torch may not be installed, pass on no torch will -# allow op builder related API to be executed. 
-try: - import torch.cuda -except ImportError: - pass - -# Delay import pynvml to avoid import error when CUDA is not available -pynvml = None - - -class CUDA_Accelerator(ApexAccelerator): - - def __init__(self): - self._name = 'cuda' - self._communication_backend_name = 'nccl' if sys.platform != 'win32' else 'gloo' - self._compile_backend = "inductor" - if pynvml is None: - self._init_pynvml() - - def _init_pynvml(self): - global pynvml - try: - import pynvml - except ImportError: - return - try: - pynvml.nvmlInit() - except pynvml.NVMLError: - pynvml = None - return - - def is_synchronized_device(self): - return False - - def use_host_timers(self): - return self.is_synchronized_device() - - def resolves_data_dependency(self): - return self.is_synchronized_device() - - def handles_memory_backpressure(self): - return self.is_synchronized_device() - - # Device APIs - def device_name(self, device_index=None): - if device_index is None: - return 'cuda' - return 'cuda:{}'.format(device_index) - - def communication_backend_version(self): - return torch.cuda.nccl.version() - - def device(self, device_index=None): - return torch.cuda.device(device_index) - - def set_device(self, device_index): - torch.cuda.set_device(device_index) - - def current_device(self): - return torch.cuda.current_device() - - def current_device_name(self): - return 'cuda:{}'.format(torch.cuda.current_device()) - - def device_count(self): - return torch.cuda.device_count() - - def synchronize(self, device_index=None): - return torch.cuda.synchronize(device_index) - - # RNG APIs - def random(self): - return torch.random - - def set_rng_state(self, new_state, device_index=None): - if device_index is None: - return torch.cuda.set_rng_state(new_state) - - return torch.cuda.set_rng_state(new_state, device_index) - - def get_rng_state(self, device_index=None): - if device_index is None: - return torch.cuda.get_rng_state() - - return torch.cuda.get_rng_state(device_index) - - def manual_seed(self, seed): - return torch.cuda.manual_seed(seed) - - def manual_seed_all(self, seed): - return torch.cuda.manual_seed_all(seed) - - def initial_seed(self): - return torch.cuda.initial_seed() - - def default_generator(self, device_index): - return torch.cuda.default_generators[device_index] - - # Streams/Events - @property - def Stream(self): - return torch.cuda.Stream - - def stream(self, stream): - return torch.cuda.stream(stream) - - def current_stream(self, device_index=None): - return torch.cuda.current_stream(device_index) - - def default_stream(self, device_index=None): - return torch.cuda.default_stream(device_index) - - @property - def Event(self): - return torch.cuda.Event - - # Memory management - def empty_cache(self): - return torch.cuda.empty_cache() - - def memory_allocated(self, device_index=None): - return torch.cuda.memory_allocated(device_index) - - def max_memory_allocated(self, device_index=None): - return torch.cuda.max_memory_allocated(device_index) - - def reset_max_memory_allocated(self, device_index=None): - return torch.cuda.reset_max_memory_allocated(device_index) - - def memory_cached(self, device_index=None): - return torch.cuda.memory_cached(device_index) - - def max_memory_cached(self, device_index=None): - return torch.cuda.max_memory_cached(device_index) - - def reset_max_memory_cached(self, device_index=None): - return torch.cuda.reset_max_memory_cached(device_index) - - def memory_stats(self, device_index=None): - if hasattr(torch.cuda, 'memory_stats'): - return torch.cuda.memory_stats(device_index) - - 
def reset_peak_memory_stats(self, device_index=None): - if hasattr(torch.cuda, 'reset_peak_memory_stats'): - return torch.cuda.reset_peak_memory_stats(device_index) - - def memory_reserved(self, device_index=None): - if hasattr(torch.cuda, 'memory_reserved'): - return torch.cuda.memory_reserved(device_index) - - def max_memory_reserved(self, device_index=None): - if hasattr(torch.cuda, 'max_memory_reserved'): - return torch.cuda.max_memory_reserved(device_index) - - def total_memory(self, device_index=None): - return torch.cuda.get_device_properties(device_index).total_memory - - def _get_nvml_gpu_id(self, torch_gpu_id): - """ - credit: https://discuss.pytorch.org/t/making-pynvml-match-torch-device-ids-cuda-visible-devices/103020 - - Remap torch device id to nvml device id, respecting CUDA_VISIBLE_DEVICES. - - If the latter isn't set return the same id - """ - # if CUDA_VISIBLE_DEVICES is used automagically remap the id since pynvml ignores this env var - if "CUDA_VISIBLE_DEVICES" in os.environ: - ids = list(map(int, os.environ.get("CUDA_VISIBLE_DEVICES", "").split(","))) - return ids[torch_gpu_id] # remap - else: - return torch_gpu_id - - def available_memory(self, device_index=None): - if pynvml: - if device_index is None: - device_index = self.current_device() - handle = pynvml.nvmlDeviceGetHandleByIndex(self._get_nvml_gpu_id(device_index)) - info = pynvml.nvmlDeviceGetMemoryInfo(handle) - return info.free - else: - return self.total_memory(device_index) - self.memory_allocated(device_index) - - # Data types - def is_bf16_supported(self): - if not torch.cuda.is_available(): - return True - return torch.cuda.is_bf16_supported() - - def is_fp16_supported(self): - if not torch.cuda.is_available(): - return True - # See https://docs.nvidia.com/deeplearning/tensorrt/support-matrix/index.html#hardware-precision-matrix - # FP16 on compute capability 6.x is deprecated - allow_deprecated_fp16 = os.environ.get('APEX_ALLOW_DEPRECATED_FP16', '0') == '1' - major, _ = torch.cuda.get_device_capability() - if major >= 7: - return True - elif major == 6 and allow_deprecated_fp16: - return True - else: - return False - - def supported_dtypes(self): - supported_dtypes = [torch.float] - if self.is_fp16_supported(): - supported_dtypes.append(torch.half) - if self.is_bf16_supported(): - supported_dtypes.append(torch.bfloat16) - return supported_dtypes - - # Misc - def amp(self): - if hasattr(torch.cuda, 'amp'): - return torch.cuda.amp - return None - - def is_available(self): - return torch.cuda.is_available() - - def range_push(self, msg): - if hasattr(torch.cuda.nvtx, 'range_push'): - return torch.cuda.nvtx.range_push(msg) - - def range_pop(self): - if hasattr(torch.cuda.nvtx, 'range_pop'): - return torch.cuda.nvtx.range_pop() - - def lazy_call(self, callback): - return torch.cuda._lazy_call(callback) - - def communication_backend_name(self): - return self._communication_backend_name - - def is_triton_supported(self): - major, _ = torch.cuda.get_device_capability() - if major >= 8: - return True - else: - return False - - # Graph operations - def create_graph(self): - return torch.cuda.CUDAGraph() - - def capture_to_graph(self, graph, pool=None, stream=None): - return torch.cuda.graph(graph, pool, stream) - - def replay_graph(self, graph): - graph.replay() - return - - # Tensor operations - - @property - def BFloat16Tensor(self): - return functools.partial(torch.tensor, dtype=torch.bfloat16, device='cuda') - - @property - def ByteTensor(self): - return functools.partial(torch.tensor, dtype=torch.uint8, 
device='cuda') - - @property - def DoubleTensor(self): - return functools.partial(torch.tensor, dtype=torch.double, device='cuda') - - @property - def FloatTensor(self): - return functools.partial(torch.tensor, dtype=torch.float, device='cuda') - - @property - def HalfTensor(self): - return functools.partial(torch.tensor, dtype=torch.half, device='cuda') - - @property - def IntTensor(self): - return functools.partial(torch.tensor, dtype=torch.int, device='cuda') - - @property - def LongTensor(self): - return functools.partial(torch.tensor, dtype=torch.long, device='cuda') - - def pin_memory(self, tensor, align_bytes=1): - return tensor.pin_memory() - - def is_pinned(self, tensor): - return tensor.is_pinned() - - def on_accelerator(self, tensor): - device_str = str(tensor.device) - if device_str.startswith('cuda:'): - return True - else: - return False - - def op_builder_dir(self): - try: - # is op_builder from apex or a 3p version? this should only succeed if it's apex - # if successful this also means we're doing a local install and not JIT compile path - from op_builder import __apex__ # noqa: F401 # type: ignore - return "op_builder" - except ImportError: - return "apex.op_builder" - - # dict that holds class name <--> class type mapping i.e. - # 'AsyncIOBuilder': - # this dict will be filled at init stage - class_dict = None - - def _lazy_init_class_dict(self): - if self.class_dict is not None: - return - else: - self.class_dict = {} - # begin initialize for create_op_builder() - # put all valid class name <--> class type mapping into class_dict - op_builder_dir = self.op_builder_dir() - op_builder_module = importlib.import_module(op_builder_dir) - op_builder_absolute_path = os.path.dirname(op_builder_module.__file__) - for _, module_name, _ in pkgutil.iter_modules([op_builder_absolute_path]): - # avoid self references, - # skip sub_directories which contains ops for other backend(cpu, npu, etc.). 
- if module_name != 'all_ops' and module_name != 'builder' and not os.path.isdir( - os.path.join(op_builder_absolute_path, module_name)): - module = importlib.import_module("{}.{}".format(op_builder_dir, module_name)) - for member_name in module.__dir__(): - if member_name.endswith( - 'Builder' - ) and member_name != "OpBuilder" and member_name != "CUDAOpBuilder" and member_name != "TorchCPUOpBuilder": # avoid abstract classes - if not member_name in self.class_dict: - self.class_dict[member_name] = getattr(module, member_name) - # end initialize for create_op_builder() - - # create an instance of op builder and return, name specified by class_name - def create_op_builder(self, class_name): - self._lazy_init_class_dict() - if class_name in self.class_dict: - return self.class_dict[class_name]() - else: - return None - - # return an op builder class, name specified by class_name - def get_op_builder(self, class_name): - self._lazy_init_class_dict() - if class_name in self.class_dict: - return self.class_dict[class_name] - else: - return None - - def build_extension(self): - from torch.utils.cpp_extension import BuildExtension - return BuildExtension - - def export_envs(self): - return ['NCCL'] - - def visible_devices_envs(self): - return ['CUDA_VISIBLE_DEVICES'] - - def set_visible_devices_envs(self, current_env, local_accelerator_ids): - for env in self.visible_devices_envs(): - current_env[env] = ",".join(map(str, local_accelerator_ids)) - - def get_compile_backend(self): - return self._compile_backend - - def set_compile_backend(self, backend): - supported_backends = torch._dynamo.list_backends(exclude_tags=()) - if backend in supported_backends: - self._compile_backend = backend - else: - raise ValueError( - f"{backend} not supported by {self.device_name()}. Supported Backends are {supported_backends}") \ No newline at end of file diff --git a/accelerator/logging.py b/accelerator/logging.py deleted file mode 100644 index d2074cc1a..000000000 --- a/accelerator/logging.py +++ /dev/null @@ -1,151 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# SPDX-License-Identifier: Apache-2.0 - -# DeepSpeed Team -import functools -import logging -import sys -import os - -log_levels = { - "debug": logging.DEBUG, - "info": logging.INFO, - "warning": logging.WARNING, - "error": logging.ERROR, - "critical": logging.CRITICAL, -} - - -class LoggerFactory: - - @staticmethod - def create_logger(name=None, level=logging.INFO): - """create a logger - - Args: - name (str): name of the logger - level: level of logger - - Raises: - ValueError is name is None - """ - - if name is None: - raise ValueError("name for logger cannot be None") - - formatter = logging.Formatter("[%(asctime)s] [%(levelname)s] " - "[%(filename)s:%(lineno)d:%(funcName)s] %(message)s") - - logger_ = logging.getLogger(name) - logger_.setLevel(level) - logger_.propagate = False - ch = logging.StreamHandler(stream=sys.stdout) - ch.setLevel(level) - ch.setFormatter(formatter) - logger_.addHandler(ch) - return logger_ - - -logger = LoggerFactory.create_logger(name="apex", level=logging.INFO) - - -@functools.lru_cache(None) -def warning_once(*args, **kwargs): - """ - This method is identical to `logger.warning()`, but will emit the warning with the same message only once - - Note: The cache is for the function arguments, so 2 different callers using the same arguments will hit the cache. - The assumption here is that all warning messages are unique across the code. 
If they aren't then need to switch to - another type of cache that includes the caller frame information in the hashing function. - """ - logger.warning(*args, **kwargs) - - -logger.warning_once = warning_once - - -def print_configuration(args, name): - logger.info("{}:".format(name)) - for arg in sorted(vars(args)): - dots = "." * (29 - len(arg)) - logger.info(" {} {} {}".format(arg, dots, getattr(args, arg))) - - -def log_dist(message, ranks=None, level=logging.INFO): - from deepspeed import comm as dist - """Log message when one of following condition meets - - + not dist.is_initialized() - + dist.get_rank() in ranks if ranks is not None or ranks = [-1] - - Args: - message (str) - ranks (list) - level (int) - - """ - should_log = not dist.is_initialized() - ranks = ranks or [] - my_rank = dist.get_rank() if dist.is_initialized() else -1 - if ranks and not should_log: - should_log = ranks[0] == -1 - should_log = should_log or (my_rank in set(ranks)) - if should_log: - final_message = "[Rank {}] {}".format(my_rank, message) - logger.log(level, final_message) - - -def print_json_dist(message, ranks=None, path=None): - from deepspeed import comm as dist - """Print message when one of following condition meets - - + not dist.is_initialized() - + dist.get_rank() in ranks if ranks is not None or ranks = [-1] - - Args: - message (str) - ranks (list) - path (str) - - """ - should_log = not dist.is_initialized() - ranks = ranks or [] - my_rank = dist.get_rank() if dist.is_initialized() else -1 - if ranks and not should_log: - should_log = ranks[0] == -1 - should_log = should_log or (my_rank in set(ranks)) - if should_log: - message['rank'] = my_rank - import json - with open(path, 'w') as outfile: - json.dump(message, outfile) - os.fsync(outfile) - - -def get_current_level(): - """ - Return logger's current log level - """ - return logger.getEffectiveLevel() - - -def should_log_le(max_log_level_str): - """ - Args: - max_log_level_str: maximum log level as a string - - Returns ``True`` if the current log_level is less or equal to the specified log level. Otherwise ``False``. - - Example: - - ``should_log_le("info")`` will return ``True`` if the current log level is either ``logging.INFO`` or ``logging.DEBUG`` - """ - - if not isinstance(max_log_level_str, str): - raise ValueError(f"{max_log_level_str} is not a string") - - max_log_level_str = max_log_level_str.lower() - if max_log_level_str not in log_levels: - raise ValueError(f"{max_log_level_str} is not one of the `logging` levels") - - return get_current_level() <= log_levels[max_log_level_str] \ No newline at end of file diff --git a/accelerator/numa.py b/accelerator/numa.py deleted file mode 100644 index 08f277490..000000000 --- a/accelerator/numa.py +++ /dev/null @@ -1,202 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# SPDX-License-Identifier: Apache-2.0 - -# DeepSpeed Team -# return a list of list for cores to numa mapping -# [ -# [ cores for numa 0 ] -# [ cores belong to numa 1 ] -# ... -# ] - -import distutils -import os -import psutil -import subprocess - - -# return a list of list for cores to numa mapping -# [ -# [ cores for numa 0 ] -# [ cores belong to numa 1 ] -# ... 
-# ] -def get_numa_cores(): - ret = [] - output = subprocess.check_output(['numactl', '--hardware']).decode("utf-8") - lines = output.split('\n') - for line in lines: - if line.startswith('available:'): - num_numas = int(line.split(' ')[1]) - break - for numa in range(num_numas): - for line in lines: - if line.startswith(f'node {numa} cpus:'): - cores = line.split(' ')[3:] - ret.append([int(core) for core in cores]) - return ret - - -def check_for_numactl_pkg(): - libs = dict( - dpkg=["-l", "numactl", "apt"], - pacman=["-Q", "numactl", "pacman"], - rpm=["-q", "numactl", "yum"], - ) - - found = False - for pkgmgr, data in libs.items(): - flag, lib, tool = data - path = distutils.spawn.find_executable(pkgmgr) - if path is not None: - cmd = [pkgmgr, flag, lib] - result = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - if result.wait() == 0: - found = True - else: - print(f"please install the {lib} package with {tool}") - break - return found - - -def parse_range(rng): - try: - value = int(rng) - return range(value, value + 1) - except ValueError: - # value is not a single number - parts = rng.split('-') - if len(parts) != 2: - raise ValueError("Bad range: '%s', range must be either a number or two number separated by dash" % - (rng, )) - start = int(parts[0]) - end = int(parts[1]) - if start > end: - raise ValueError("Bad range: '%s', range end must larger than or equal to start" % (rng, )) - return range(start, end + 1) - - -# parse comma and dash separated range list into list -# i.e. "0,2-4,6" --> [0, 2, 3, 4, 6] -# rules: -# 1. Range list number be comma separated, each item are either a single number, -# or a range marked by two numbers (both number are included in the range) -# 2. Sub ranges must be in ascend order and not overlap with each other -# 3. No space in the range expression -def parse_range_list(range_str): - number_list = [] - last = -1 - range_list = range_str.split(',') - for sub_range in range_list: - sub_number_list = parse_range(sub_range) - if sub_number_list[0] <= last: - raise ValueError( - "Bad range: '%s', sub ranges must not overlap with each other and should be in ascend order" % - (range_str, )) - last = sub_number_list[-1] - number_list.extend(sub_number_list) - return number_list - - -def get_numactl_cmd(bind_core_list, num_local_procs, local_rank): - numactl_cmd = [] - check_for_numactl_pkg() - if 'KMP_AFFINITY' in os.environ.keys(): - raise ValueError("Environment variable KMP_AFFINITY conflicts with numactl " - "because it interfere with how many CPU cores numactl can set. 
" - "Unset KMP_AFFINITY before launching deepspeed.\n\n" - "\t$ unset KMP_AFFINITY\n" - "\t$ deepspeed ") - if bind_core_list is not None: - core_list = parse_range_list(bind_core_list) - total_cores = len(core_list) - else: - total_cores = psutil.cpu_count(logical=False) - core_list = range(total_cores) - cores_per_rank = total_cores // num_local_procs - assert cores_per_rank >= 1, "At least one core needs to be assigned to each rank" - core_list_for_rank = core_list[cores_per_rank * local_rank:cores_per_rank * (local_rank + 1)] - numactl_cmd.append("numactl") - - # check if all cores belong to same numa, if true, bind process to that numa domain with -m parameter - numa_cores = get_numa_cores() - num_numas = len(numa_cores) - - numa_mode = "normal" - - non_empty_numa_list = [] - empty_numa_list = [] - previous_numa_cores = [] - numa_node_list = [] - numa_node_list_list = [] - for i in range(num_numas): - # look for empty numa which is HBM numa - if numa_cores[i] == []: - empty_numa_list.append(i) - else: - non_empty_numa_list.append(i) - - # check for fakenuma - if numa_cores[i] == previous_numa_cores: - if numa_node_list == []: - #first duplication, add previous node into list - numa_node_list.append(i - 1) - numa_node_list.append(i) - else: - if numa_node_list != []: - numa_node_list_list.append(numa_node_list) - numa_node_list = [] - previous_numa_cores = numa_cores[i] - if numa_node_list != []: - numa_node_list_list.append(numa_node_list) - - if empty_numa_list != [] and len(empty_numa_list) == len(non_empty_numa_list): - numa_mode = "flat_hbm" - numa_dict = dict(zip(non_empty_numa_list, empty_numa_list)) - elif numa_node_list_list != []: - numa_mode = "fake" - - if numa_mode == "normal": - for i in range(num_numas): - if set(core_list_for_rank) <= set(numa_cores[i]): - numactl_cmd.append("-m") - numactl_cmd.append(f"{i}") - break - elif numa_mode == "flat_hbm": - for i in range(num_numas): - if set(core_list_for_rank) <= set(numa_cores[i]): - numactl_cmd.append("-p") - numactl_cmd.append(f"{numa_dict[i]}") - break - elif numa_mode == "fake": - for i in range(num_numas): - if set(core_list_for_rank) <= set(numa_cores[i]): - for nodes in numa_node_list_list: - if i in nodes: - numactl_cmd.append("-m") - numactl_cmd.append(f"{','.join(map(str, nodes))}") - break - # the following construct break the outer loop if inner loop breaks - else: - continue - break - - numactl_cmd.append("-C") - last_core = core_list_for_rank[0] - first_core = last_core - core_list_str = f"{last_core}" - for core_id in core_list_for_rank[1:]: - if core_id == last_core + 1: - last_core = core_id - continue - else: - if first_core == last_core: - core_list_str = f"{core_list_str},{core_id}" - else: - core_list_str = f"{core_list_str}-{last_core},{core_id}" - first_core = core_id - last_core = core_id - if first_core != last_core: - core_list_str = f"{core_list_str}-{last_core}" - numactl_cmd.append(f"{core_list_str}") - return cores_per_rank, numactl_cmd \ No newline at end of file diff --git a/accelerator/real_accelerator.py b/accelerator/real_accelerator.py deleted file mode 100644 index e4092a5e8..000000000 --- a/accelerator/real_accelerator.py +++ /dev/null @@ -1,126 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# SPDX-License-Identifier: Apache-2.0 - -# DeepSpeed Team -import os - -try: - # Importing logger currently requires that torch is installed, hence the try...except - # TODO: Remove logger dependency on torch. 
- from apex.accelerator.logging import logger as accel_logger -except ImportError as e: - accel_logger = None - -try: - from accelerator.abstract_accelerator import ApexAccelerator as dsa1 -except ImportError as e: - dsa1 = None -try: - from apex.accelerator.abstract_accelerator import ApexAccelerator as dsa2 -except ImportError as e: - dsa2 = None - -SUPPORTED_ACCELERATOR_LIST = ['cuda', 'cpu'] - -ds_accelerator = None - - -def _validate_accelerator(accel_obj): - # because abstract_accelerator has different path during - # build time (accelerator.abstract_accelerator) - # and run time (apex.accelerator.abstract_accelerator) - # and extension would import the - # run time abstract_accelerator/apex as its base - # class, so we need to compare accel_obj with both base class. - # if accel_obj is instance of ApexAccelerator in one of - # accelerator.abstractor_accelerator - # or apex.accelerator.abstract_accelerator, consider accel_obj - # is a conforming object - if not ((dsa1 is not None and isinstance(accel_obj, dsa1)) or (dsa2 is not None and isinstance(accel_obj, dsa2))): - raise AssertionError(f"{accel_obj.__class__.__name__} accelerator is not subclass of ApexAccelerator") - - # TODO: turn off is_available test since this breaks tests - # assert accel_obj.is_available(), \ - # f'{accel_obj.__class__.__name__} accelerator fails is_available() test' - - -def is_current_accelerator_supported(): - return get_accelerator().device_name() in SUPPORTED_ACCELERATOR_LIST - - -def get_accelerator(): - global ds_accelerator - if ds_accelerator is not None: - return ds_accelerator - - accelerator_name = None - ds_set_method = None - # 1. Detect whether there is override of apex accelerators from environment variable. - if "APEX_ACCELERATOR" in os.environ.keys(): - accelerator_name = os.environ["APEX_ACCELERATOR"] - if accelerator_name == "cpu": - pass - elif accelerator_name not in SUPPORTED_ACCELERATOR_LIST: - raise ValueError(f'APEX_ACCELERATOR must be one of {SUPPORTED_ACCELERATOR_LIST}. ' - f'Value "{accelerator_name}" is not supported') - ds_set_method = "override" - - # 2. If no override, detect which accelerator to use automatically - if accelerator_name is None: - # We need a way to choose among different accelerator types. - # Currently we detect which accelerator extension is installed - # in the environment and use it if the installing answer is True. - # An alternative might be detect whether CUDA device is installed on - # the system but this comes with two pitfalls: - # 1. the system may not have torch pre-installed, so - # get_accelerator().is_available() may not work. - # 2. Some scenario like install on login node (without CUDA device) - # and run on compute node (with CUDA device) may cause mismatch - # between installation time and runtime. - - if accelerator_name is None: - try: - import torch - - # Determine if we are on a GPU or x86 CPU with torch. - # "torch.cuda.is_available()" provides a stronger guarantee, #ignore-cuda - # ensuring that we are free from CUDA initialization errors. 
- # While "torch.cuda.device_count() > 0" check ensures that #ignore-cuda - # we won't try to do any CUDA calls when no device is available - # For reference: https://github.com/deepspeedai/DeepSpeed/pull/6810 - if torch.cuda.device_count() > 0 and torch.cuda.is_available(): #ignore-cuda - accelerator_name = "cuda" - except (RuntimeError, ImportError) as e: - # TODO need a more decent way to detect which accelerator to use, consider using nvidia-smi command for detection - pass - if accelerator_name is None: - # borrow this log from PR#5084 - if accel_logger is not None: - accel_logger.warning( - "Setting accelerator to CPU. If you have GPU or other accelerator, we were unable to detect it.") - # cpu added as catch-all when accelerator detection fails - accelerator_name = "cpu" - - ds_set_method = "auto detect" - - # 3. Set ds_accelerator accordingly - if accelerator_name == "cuda": - from .cuda_accelerator import CUDA_Accelerator - - ds_accelerator = CUDA_Accelerator() - elif accelerator_name == "cpu": - from .cpu_accelerator import CPU_Accelerator - - ds_accelerator = CPU_Accelerator() - _validate_accelerator(ds_accelerator) - if accel_logger is not None: - accel_logger.info(f"Setting apex_accelerator to {ds_accelerator._name} ({ds_set_method})") - return ds_accelerator - - -def set_accelerator(accel_obj): - global ds_accelerator - _validate_accelerator(accel_obj) - if accel_logger is not None: - accel_logger.info(f"Setting apex_accelerator to {accel_obj._name} (model specified)") - ds_accelerator = accel_obj diff --git a/accelerator/utils.py b/accelerator/utils.py deleted file mode 100644 index bb6ec2d7c..000000000 --- a/accelerator/utils.py +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# SPDX-License-Identifier: Apache-2.0 - -# DeepSpeed Team -""" -Copyright NVIDIA/Megatron - -Helper functions and classes from multiple sources. 
-""" - -class noop_context(object): - - def __init__(self): - pass - - def __enter__(self): - pass - - def __exit__(self, exc_type, exc_val, exc_tb): - pass \ No newline at end of file diff --git a/apex/git_version_info.py b/apex/git_version_info.py index 1a2b76cd1..3b20d4d39 100644 --- a/apex/git_version_info.py +++ b/apex/git_version_info.py @@ -18,7 +18,6 @@ from .op_builder.all_ops import ALL_OPS installed_ops = dict.fromkeys(ALL_OPS.keys(), False) - accelerator_name = "" torch_info = {'version': "0.0", "cuda_version": "0.0", "hip_version": "0.0"} # compatible_ops list is recreated for each launch diff --git a/op_builder/__init__.py b/op_builder/__init__.py index cc7ea9765..6354f0559 100644 --- a/op_builder/__init__.py +++ b/op_builder/__init__.py @@ -30,15 +30,15 @@ def builder_closure(member_name): # during installation time cannot get builder due to torch not installed, # return closure instead def _builder(): - from apex.accelerator import get_accelerator - builder = get_accelerator().create_op_builder(member_name) + from apex.op_builder.all_ops import BuilderUtils + builder = BuilderUtils().create_op_builder(member_name) return builder return _builder else: # during runtime, return op builder class directly - from apex.accelerator import get_accelerator - builder = get_accelerator().get_op_builder(member_name) + from apex.op_builder.all_ops import BuilderUtils + builder = BuilderUtils().get_op_builder(member_name) return builder diff --git a/op_builder/all_ops.py b/op_builder/all_ops.py index 7de5ae8e6..273914312 100644 --- a/op_builder/all_ops.py +++ b/op_builder/all_ops.py @@ -6,16 +6,67 @@ import os import pkgutil import importlib -try: - # during installation time accelerator is visible, otherwise return apex.accelerator - from accelerator import get_accelerator -except ImportError: - from apex.accelerator import get_accelerator + +class BuilderUtils: + def op_builder_dir(self): + try: + # is op_builder from apex or a 3p version? this should only succeed if it's apex + # if successful this also means we're doing a local install and not JIT compile path + from op_builder import __apex__ # noqa: F401 # type: ignore + return "op_builder" + except ImportError: + return "apex.op_builder" + + # dict that holds class name <--> class type mapping i.e. + # 'AsyncIOBuilder': + # this dict will be filled at init stage + class_dict = None + + def _lazy_init_class_dict(self): + if self.class_dict is not None: + return + else: + self.class_dict = {} + # begin initialize for create_op_builder() + # put all valid class name <--> class type mapping into class_dict + op_builder_dir = self.op_builder_dir() + op_builder_module = importlib.import_module(op_builder_dir) + op_builder_absolute_path = os.path.dirname(op_builder_module.__file__) + for _, module_name, _ in pkgutil.iter_modules([op_builder_absolute_path]): + # avoid self references, + # skip sub_directories which contains ops for other backend(cpu, npu, etc.). 
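+            # e.g. after this scan, class_dict maps builder names to classes,
+            # roughly {'FusedDenseBuilder': FusedDenseBuilder, ...}, so
+            # BuilderUtils().get_op_builder('FusedDenseBuilder') returns the class and
+            # BuilderUtils().create_op_builder('FusedDenseBuilder') returns an instance.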
+ if module_name != 'all_ops' and module_name != 'builder' and not os.path.isdir( + os.path.join(op_builder_absolute_path, module_name)): + module = importlib.import_module("{}.{}".format(op_builder_dir, module_name)) + for member_name in module.__dir__(): + if member_name.endswith( + 'Builder' + ) and member_name != "OpBuilder" and member_name != "CUDAOpBuilder" and member_name != "TorchCPUOpBuilder": # avoid abstract classes + if not member_name in self.class_dict: + self.class_dict[member_name] = getattr(module, member_name) + # end initialize for create_op_builder() + + # create an instance of op builder and return, name specified by class_name + def create_op_builder(self, class_name): + self._lazy_init_class_dict() + if class_name in self.class_dict: + return self.class_dict[class_name]() + else: + return None + + # return an op builder class, name specified by class_name + def get_op_builder(self, class_name): + self._lazy_init_class_dict() + if class_name in self.class_dict: + return self.class_dict[class_name] + else: + return None # List of all available ops # append all builder names into __op_builders__ -op_builder_dir = get_accelerator().op_builder_dir() +builder_utils = BuilderUtils() +op_builder_dir = builder_utils.op_builder_dir() op_builder_module = importlib.import_module(op_builder_dir) __op_builders__ = [] @@ -26,8 +77,7 @@ for member_name in module.__dir__(): if member_name.endswith('Builder'): # append builder to __op_builders__ list - builder = get_accelerator().create_op_builder(member_name) + builder = builder_utils.create_op_builder(member_name) __op_builders__.append(builder) -ALL_OPS = {op.name: op for op in __op_builders__ if op is not None} -accelerator_name = get_accelerator()._name \ No newline at end of file +ALL_OPS = {op.name: op for op in __op_builders__ if op is not None} \ No newline at end of file diff --git a/op_builder/builder.py b/op_builder/builder.py index 4ee01f095..22bd091a9 100644 --- a/op_builder/builder.py +++ b/op_builder/builder.py @@ -524,9 +524,8 @@ def load(self, verbose=True): if self.name in __class__._loaded_ops: return __class__._loaded_ops[self.name] - from apex.git_version_info import installed_ops, torch_info, accelerator_name - from apex.accelerator import get_accelerator - if installed_ops.get(self.name, False) and accelerator_name == get_accelerator()._name: + from apex.git_version_info import installed_ops, torch_info + if installed_ops.get(self.name, False): # Ensure the op we're about to load was compiled with the same # torch/cuda versions we are currently using at runtime. 
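            # With accelerator_name dropped from git_version_info, the prebuilt path is
            # now chosen purely from installed_ops; ops that were not pre-compiled
            # presumably fall through to the JIT compile path later in load()
            # (not shown in this hunk).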
self.validate_torch_version(torch_info) diff --git a/setup.py b/setup.py index 0353d77c2..f16df2322 100644 --- a/setup.py +++ b/setup.py @@ -22,7 +22,7 @@ sys.path.insert(0, os.path.abspath(os.path.dirname(__file__))) -from op_builder.all_ops import ALL_OPS, accelerator_name +from op_builder.all_ops import ALL_OPS # ninja build does not work unless include_dirs are abs path this_dir = os.path.dirname(os.path.abspath(__file__)) @@ -274,7 +274,6 @@ def is_op_included(op_name): fd.write(f"git_branch='{git_branch}'\n") fd.write(f"installed_ops={install_ops}\n") fd.write(f"build_flags={build_flags}\n") - fd.write(f"accelerator_name='{accelerator_name}'\n") fd.write(f"torch_info={torch_info}\n") if "--cpp_ext" in sys.argv: From 6130493039f18475ea24cfd2f0b44a0798b14dc0 Mon Sep 17 00:00:00 2001 From: skishore Date: Tue, 5 Aug 2025 21:54:48 +0000 Subject: [PATCH 54/79] remove accelerator package usages --- apex/accelerator | 1 - op_builder/__init__.py | 3 +-- setup.py | 2 +- 3 files changed, 2 insertions(+), 4 deletions(-) delete mode 120000 apex/accelerator diff --git a/apex/accelerator b/apex/accelerator deleted file mode 120000 index 14bf59231..000000000 --- a/apex/accelerator +++ /dev/null @@ -1 +0,0 @@ -../accelerator \ No newline at end of file diff --git a/op_builder/__init__.py b/op_builder/__init__.py index 6354f0559..b60ca42ae 100644 --- a/op_builder/__init__.py +++ b/op_builder/__init__.py @@ -10,7 +10,6 @@ from .builder import get_default_compute_capabilities, OpBuilder -# Do not remove, required for abstract accelerator to detect if we have a apex or 3p op_builder __apex__ = True # List of all available op builders from apex op_builder @@ -42,7 +41,7 @@ def _builder(): return builder -# reflect builder names and add builder closure, such as 'TransformerBuilder()' creates op builder wrt current accelerator +# reflect builder names and add builder closure, such as 'TransformerBuilder()' creates op builder for _, module_name, _ in pkgutil.iter_modules([os.path.dirname(this_module.__file__)]): if module_name != 'all_ops' and module_name != 'builder': module = importlib.import_module(f".{module_name}", package=op_builder_dir) diff --git a/setup.py b/setup.py index f16df2322..fce65fd77 100644 --- a/setup.py +++ b/setup.py @@ -289,7 +289,7 @@ def is_op_included(op_name): name="apex", version=get_apex_version(), packages=find_packages( - exclude=("build", "include", "tests", "dist", "docs", "tests", "examples", "apex.egg-info", "op_builder", "accelerator") + exclude=("build", "include", "tests", "dist", "docs", "tests", "examples", "apex.egg-info", "op_builder") ), description="PyTorch Extensions written by NVIDIA", ext_modules=ext_modules, From 4de63aa1ffea92bcd690c6c4a83642f40528d458 Mon Sep 17 00:00:00 2001 From: skishore Date: Tue, 5 Aug 2025 22:10:30 +0000 Subject: [PATCH 55/79] revert code that was removed by mistake --- op_builder/__init__.py | 4 ++-- op_builder/all_ops.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/op_builder/__init__.py b/op_builder/__init__.py index b60ca42ae..5b1b484c9 100644 --- a/op_builder/__init__.py +++ b/op_builder/__init__.py @@ -40,8 +40,8 @@ def _builder(): builder = BuilderUtils().get_op_builder(member_name) return builder - -# reflect builder names and add builder closure, such as 'TransformerBuilder()' creates op builder +# this is for the import statement such as 'from apex.op_builder import FusedLayerNormBuilder' to work +# reflect builder names and add builder closure, such as 'apex.op_builder.FusedLayerNormBuilder()' creates 
op builder for _, module_name, _ in pkgutil.iter_modules([os.path.dirname(this_module.__file__)]): if module_name != 'all_ops' and module_name != 'builder': module = importlib.import_module(f".{module_name}", package=op_builder_dir) diff --git a/op_builder/all_ops.py b/op_builder/all_ops.py index 273914312..41fa091b6 100644 --- a/op_builder/all_ops.py +++ b/op_builder/all_ops.py @@ -12,7 +12,7 @@ def op_builder_dir(self): try: # is op_builder from apex or a 3p version? this should only succeed if it's apex # if successful this also means we're doing a local install and not JIT compile path - from op_builder import __apex__ # noqa: F401 # type: ignore + from op_builder import __apex__ return "op_builder" except ImportError: return "apex.op_builder" @@ -68,6 +68,7 @@ def get_op_builder(self, class_name): builder_utils = BuilderUtils() op_builder_dir = builder_utils.op_builder_dir() op_builder_module = importlib.import_module(op_builder_dir) +print ("op_builder_module", op_builder_module) __op_builders__ = [] for _, module_name, _ in pkgutil.iter_modules([os.path.dirname(op_builder_module.__file__)]): From 223ab1deb7f425e22c3a792f3e68dce491156351 Mon Sep 17 00:00:00 2001 From: skishore Date: Wed, 6 Aug 2025 15:55:18 +0000 Subject: [PATCH 56/79] Cleaning up the setup file and renaming functions and variable to more readable names. --- setup.py | 41 ++++++++++++----------------------------- 1 file changed, 12 insertions(+), 29 deletions(-) diff --git a/setup.py b/setup.py index fce65fd77..23cfb4b0b 100644 --- a/setup.py +++ b/setup.py @@ -161,17 +161,17 @@ def is_env_set(key): """ return bool(os.environ.get(key, None)) -def op_envvar(op_name): +def get_op_build_env_name(op_name): assert hasattr(ALL_OPS[op_name], 'BUILD_VAR'), \ f"{op_name} is missing BUILD_VAR field" return ALL_OPS[op_name].BUILD_VAR -def op_enabled(op_name): - env_var = op_envvar(op_name) +def op_build_enabled(op_name): + env_var = get_op_build_env_name(op_name) return int(get_env_if_set(env_var, BUILD_OP_DEFAULT)) -def is_op_included(op_name): +def is_op_build_included(op_name): #check if operation has BUILD_FLAG defined assert hasattr(ALL_OPS[op_name], 'INCLUDE_FLAG'), \ f"{op_name} is missing INCLUDE_FLAG field" @@ -183,22 +183,21 @@ def is_op_included(op_name): for op_name, builder in ALL_OPS.items(): op_compatible = builder.is_compatible() - enabled = op_enabled(op_name) or is_op_included(op_name) + build_enabled = op_build_enabled(op_name) or is_op_build_included(op_name) # If op is requested but not available, throw an error. - if enabled and not op_compatible: + if build_enabled and not op_compatible: + env_var = get_op_build_env_name(op_name) builder.warning(f"Skip pre-compile of incompatible {op_name}; One can disable {op_name} with {env_var}=0") - if not is_env_set(env_var): - builder.warning(f"Skip pre-compile of incompatible {op_name}; One can disable {op_name} with {env_var}=0") continue - # If op is compatible but install is not enabled (JIT mode). - if IS_ROCM_PYTORCH and op_compatible and not enabled: + # If op is compatible but install is not build enabled (JIT mode). + if IS_ROCM_PYTORCH and op_compatible and not build_enabled: builder.hipify_extension() - # If op install enabled, add builder to extensions. + # If op build enabled, add builder to extensions. 
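    # For example (env-var name illustrative only): if an op's BUILD_VAR, say
    # APEX_BUILD_FUSED_DENSE, is exported as 1 and its builder reports is_compatible(),
    # the op is recorded in install_ops and its extension is pre-compiled here;
    # otherwise it is skipped and left to the JIT compile path at first use.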
# Also check if corresponding flags are checked - if enabled and op_compatible: + if build_enabled and op_compatible: install_ops[op_name] = True ext_modules.append(builder.builder()) @@ -222,23 +221,7 @@ def is_op_included(op_name): # Parse the apex version string from version.txt. version_str = get_apex_version() - -# Build specifiers like .devX can be added at install time. Otherwise, add the git hash. -# Example: `APEX_BUILD_STRING=".dev20201022" python -m build --no-isolation`. - -# Building wheel for distribution, update version file. -if is_env_set('APEX_BUILD_STRING'): - # Build string env specified, probably building for distribution. - with open('build.txt', 'w') as fd: - fd.write(os.environ['APEX_BUILD_STRING']) - version_str += os.environ['APEX_BUILD_STRING'] -elif os.path.isfile('build.txt'): - # build.txt exists, probably installing from distribution. - with open('build.txt', 'r') as fd: - version_str += fd.read().strip() -else: - # None of the above, probably installing from source. - version_str += f'+{git_hash}' +version_str += f'+{git_hash}' torch_version = ".".join([str(TORCH_MAJOR), str(TORCH_MINOR)]) bf16_support = False From 954e7ce70a2bfd083df4f34e9ad86f9ec2b9b573 Mon Sep 17 00:00:00 2001 From: skishore Date: Tue, 12 Aug 2025 20:30:05 +0000 Subject: [PATCH 57/79] Fix the nccl version so that the nccl_allocator.so file can be loaded properly. Setup() call has an argument called py_modules which copies the python class into sitepackages folder. The python modules in the compatibility folder do lazy load of the builder classes. First these files are copied in the parent folder so that the files themselves are copied into sitepackages so that the kernel can be loaded into python then these temporary files are deleted. --- apex/fused_dense/fused_dense.py | 4 +- compatibility/__init__.py | 0 compatibility/_apex_nccl_allocator.py | 37 ++++++++++++++++ compatibility/amp_C.py | 37 ++++++++++++++++ compatibility/apex_C.py | 37 ++++++++++++++++ compatibility/bnp.py | 37 ++++++++++++++++ compatibility/distributed_adam_cuda.py | 37 ++++++++++++++++ compatibility/distributed_lamb_cuda.py | 37 ++++++++++++++++ compatibility/fast_multihead_attn.py | 37 ++++++++++++++++ compatibility/focal_loss_cuda.py | 37 ++++++++++++++++ compatibility/fused_adam_cuda.py | 37 ++++++++++++++++ compatibility/fused_bias_swiglu.py | 37 ++++++++++++++++ compatibility/fused_dense_cuda.py | 37 ++++++++++++++++ compatibility/fused_index_mul_2d.py | 37 ++++++++++++++++ compatibility/fused_lamb_cuda.py | 37 ++++++++++++++++ compatibility/fused_layer_norm_cuda.py | 44 +++++++++++++++++++ .../fused_rotary_positional_embedding.py | 37 ++++++++++++++++ .../fused_weight_gradient_mlp_cuda.py | 37 ++++++++++++++++ .../generic_scaled_masked_softmax_cuda.py | 37 ++++++++++++++++ compatibility/mlp_cuda.py | 44 +++++++++++++++++++ compatibility/nccl_p2p_cuda.py | 37 ++++++++++++++++ compatibility/peer_memory_cuda.py | 37 ++++++++++++++++ compatibility/scaled_masked-softmax_cuda.py | 37 ++++++++++++++++ compatibility/scaled_softmax_cuda.py | 37 ++++++++++++++++ ...scaled_upper_triang_masked_softmax_cuda.py | 38 ++++++++++++++++ compatibility/syncbn.py | 37 ++++++++++++++++ compatibility/transducer_joint_cuda.py | 37 ++++++++++++++++ compatibility/transducer_loss_cuda.py | 37 ++++++++++++++++ compatibility/xentropy_cuda.py | 37 ++++++++++++++++ op_builder/builder.py | 7 +-- setup.py | 29 +++++++++++- 31 files changed, 1044 insertions(+), 10 deletions(-) create mode 100644 compatibility/__init__.py create mode 100644 
compatibility/_apex_nccl_allocator.py create mode 100644 compatibility/amp_C.py create mode 100644 compatibility/apex_C.py create mode 100644 compatibility/bnp.py create mode 100644 compatibility/distributed_adam_cuda.py create mode 100644 compatibility/distributed_lamb_cuda.py create mode 100644 compatibility/fast_multihead_attn.py create mode 100644 compatibility/focal_loss_cuda.py create mode 100644 compatibility/fused_adam_cuda.py create mode 100644 compatibility/fused_bias_swiglu.py create mode 100644 compatibility/fused_dense_cuda.py create mode 100644 compatibility/fused_index_mul_2d.py create mode 100644 compatibility/fused_lamb_cuda.py create mode 100644 compatibility/fused_layer_norm_cuda.py create mode 100644 compatibility/fused_rotary_positional_embedding.py create mode 100644 compatibility/fused_weight_gradient_mlp_cuda.py create mode 100644 compatibility/generic_scaled_masked_softmax_cuda.py create mode 100644 compatibility/mlp_cuda.py create mode 100644 compatibility/nccl_p2p_cuda.py create mode 100644 compatibility/peer_memory_cuda.py create mode 100644 compatibility/scaled_masked-softmax_cuda.py create mode 100644 compatibility/scaled_softmax_cuda.py create mode 100644 compatibility/scaled_upper_triang_masked_softmax_cuda.py create mode 100644 compatibility/syncbn.py create mode 100644 compatibility/transducer_joint_cuda.py create mode 100644 compatibility/transducer_loss_cuda.py create mode 100644 compatibility/xentropy_cuda.py diff --git a/apex/fused_dense/fused_dense.py b/apex/fused_dense/fused_dense.py index 8f9812d20..97377a423 100644 --- a/apex/fused_dense/fused_dense.py +++ b/apex/fused_dense/fused_dense.py @@ -1,11 +1,9 @@ import torch from torch import nn -from apex.op_builder import FusedDenseBuilder +import fused_dense_cuda from apex._autocast_utils import _cast_if_autocast_enabled import math -fused_dense_cuda = FusedDenseBuilder().load() - #implements fused GEMM+bias in forward pass using mlp_cuda from apex class FusedDenseFunc(torch.autograd.Function): @staticmethod diff --git a/compatibility/__init__.py b/compatibility/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/compatibility/_apex_nccl_allocator.py b/compatibility/_apex_nccl_allocator.py new file mode 100644 index 000000000..6a029d1ee --- /dev/null +++ b/compatibility/_apex_nccl_allocator.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _ApexNcclAllocatorModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'NCCLAllocatorBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load _apex_nccl_allocator : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_") and name != "__class__": + raise AttributeError(f"module _apex_nccl_allocator has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _ApexNcclAllocatorModule() \ No newline at end of file diff --git a/compatibility/amp_C.py b/compatibility/amp_C.py new file mode 100644 index 000000000..f9257c596 --- /dev/null +++ b/compatibility/amp_C.py @@ -0,0 +1,37 @@ +import sys 
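+# Each module in compatibility/ follows the same lazy-loader pattern as
+# _apex_nccl_allocator.py above: the module object replaces itself in sys.modules,
+# and the first real attribute access imports apex.op_builder, instantiates the
+# matching builder (AmpCBuilder here) and calls .load() to build or load the
+# extension. Existing call sites keep working unchanged, e.g.:
+#   import amp_C
+#   amp_C.multi_tensor_scale(...)   # triggers AmpCBuilder().load() on first access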
+import importlib + +class _AmpCModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'AmpCBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load amp_C : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module amp_C has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _AmpCModule() \ No newline at end of file diff --git a/compatibility/apex_C.py b/compatibility/apex_C.py new file mode 100644 index 000000000..39bac5264 --- /dev/null +++ b/compatibility/apex_C.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _ApexCModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'ApexCBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load apex_C : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module apex_C has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _ApexCModule() \ No newline at end of file diff --git a/compatibility/bnp.py b/compatibility/bnp.py new file mode 100644 index 000000000..b03ba798c --- /dev/null +++ b/compatibility/bnp.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _BnpModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'BnpBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load bnp : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module bnp has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _BnpModule() \ No newline at end of file diff --git a/compatibility/distributed_adam_cuda.py b/compatibility/distributed_adam_cuda.py new file mode 100644 index 000000000..2566dce11 --- /dev/null +++ b/compatibility/distributed_adam_cuda.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _DistributedAdamCudaModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + 
try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'DistributedAdamBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load distributed_adam_cuda : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module distributed_adam_cuda has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _DistributedAdamCudaModule() \ No newline at end of file diff --git a/compatibility/distributed_lamb_cuda.py b/compatibility/distributed_lamb_cuda.py new file mode 100644 index 000000000..7f0b64f3e --- /dev/null +++ b/compatibility/distributed_lamb_cuda.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _DistributedLambCudaModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'DistributedLambBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load distributed_lamb_cuda : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module distributed_lamb_cuda has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _DistributedLambCudaModule() \ No newline at end of file diff --git a/compatibility/fast_multihead_attn.py b/compatibility/fast_multihead_attn.py new file mode 100644 index 000000000..a9e060b87 --- /dev/null +++ b/compatibility/fast_multihead_attn.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _FastMultiheadAttnModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'FastMultiheadAttnBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load fast_multihead_attn : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module fast_multihead_attn has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _FastMultiheadAttnModule() \ No newline at end of file diff --git a/compatibility/focal_loss_cuda.py b/compatibility/focal_loss_cuda.py new file mode 100644 index 000000000..c7b364faf --- /dev/null +++ b/compatibility/focal_loss_cuda.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _FocalLossCudaModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None 
and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'FocalLossBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load focal_loss_cuda : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module focal_loss_cuda has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _FocalLossCudaModule() \ No newline at end of file diff --git a/compatibility/fused_adam_cuda.py b/compatibility/fused_adam_cuda.py new file mode 100644 index 000000000..bf31ca739 --- /dev/null +++ b/compatibility/fused_adam_cuda.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _FusedAdamCudaModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'FusedAdamBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load fused_adam_cuda : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module fused_adam_cuda has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _FusedAdamCudaModule() \ No newline at end of file diff --git a/compatibility/fused_bias_swiglu.py b/compatibility/fused_bias_swiglu.py new file mode 100644 index 000000000..e9f066f4a --- /dev/null +++ b/compatibility/fused_bias_swiglu.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _FusedBiasSwiGLUModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'FusedBiasSwiGLUBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load fused_bias_swiglu : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module fused_bias_swiglu has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _FusedBiasSwiGLUModule() \ No newline at end of file diff --git a/compatibility/fused_dense_cuda.py b/compatibility/fused_dense_cuda.py new file mode 100644 index 000000000..0d28badb2 --- /dev/null +++ b/compatibility/fused_dense_cuda.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _FusedDenseCudaModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading 
= True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'FusedDenseBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load fused_dense_cuda : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module fused_dense_cuda has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _FusedDenseCudaModule() \ No newline at end of file diff --git a/compatibility/fused_index_mul_2d.py b/compatibility/fused_index_mul_2d.py new file mode 100644 index 000000000..c036877df --- /dev/null +++ b/compatibility/fused_index_mul_2d.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _FusedIndexMul2dModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'FusedIndexMul2dBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load fused_index_mul_2d : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module fused_index_mul_2d has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _FusedIndexMul2dModule() \ No newline at end of file diff --git a/compatibility/fused_lamb_cuda.py b/compatibility/fused_lamb_cuda.py new file mode 100644 index 000000000..3ab88d443 --- /dev/null +++ b/compatibility/fused_lamb_cuda.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _FusedLambCudaModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'FusedLambBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load fused_lamb_cuda : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module fused_lamb_cuda has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _FusedLambCudaModule() \ No newline at end of file diff --git a/compatibility/fused_layer_norm_cuda.py b/compatibility/fused_layer_norm_cuda.py new file mode 100644 index 000000000..2722e0252 --- /dev/null +++ b/compatibility/fused_layer_norm_cuda.py @@ -0,0 +1,44 @@ +import sys +import importlib + +class _FusedLayerCudaModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: 
+ #import the builder + apex_op_builder = importlib.import_module('apex.op_builder') + mlp_builder = getattr(apex_op_builder, 'FusedLayerNormBuilder') + + #load the module + self._loaded_module = mlp_builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load fused_layer_norm_cuda : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module fused_layer_norm_cuda has no attribute '{name}'") + + module = self._load_module() + return getattr(module, name) + + def __dir__(self): + try: + module = self._load_module() + return dir(module) + except: + return [] + + def __repr__(self): + return "" + +#replace module with lazy loader +sys.modules[__name__] = _FusedLayerCudaModule() \ No newline at end of file diff --git a/compatibility/fused_rotary_positional_embedding.py b/compatibility/fused_rotary_positional_embedding.py new file mode 100644 index 000000000..d4f87bd33 --- /dev/null +++ b/compatibility/fused_rotary_positional_embedding.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _FusedRotaryPositionalEmbeddingModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'FusedRopeBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load fused_rotary_positional_embedding : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module fused_rotary_positional_embedding has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _FusedRotaryPositionalEmbeddingModule() \ No newline at end of file diff --git a/compatibility/fused_weight_gradient_mlp_cuda.py b/compatibility/fused_weight_gradient_mlp_cuda.py new file mode 100644 index 000000000..219d9355b --- /dev/null +++ b/compatibility/fused_weight_gradient_mlp_cuda.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _FusedWeightGradientMlpCudaModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'FusedWeightGradientMlpCudaBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load fused_weight_gradient_mlp_cuda : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module fused_weight_gradient_mlp_cuda has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _FusedWeightGradientMlpCudaModule() \ No newline at end of file diff --git a/compatibility/generic_scaled_masked_softmax_cuda.py b/compatibility/generic_scaled_masked_softmax_cuda.py new file 
mode 100644 index 000000000..fa50ca52c --- /dev/null +++ b/compatibility/generic_scaled_masked_softmax_cuda.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _GenericScaledMaskedSoftmaxCudaModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'GenericScaledMaskedSoftmaxCudaBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load generic_scaled_masked_softmax_cuda : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module generic_scaled_masked_softmax_cuda has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _GenericScaledMaskedSoftmaxCudaModule() \ No newline at end of file diff --git a/compatibility/mlp_cuda.py b/compatibility/mlp_cuda.py new file mode 100644 index 000000000..4c873d560 --- /dev/null +++ b/compatibility/mlp_cuda.py @@ -0,0 +1,44 @@ +import sys +import importlib + +class _MLPCudaModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + #import the builder + apex_op_builder = importlib.import_module('apex.op_builder') + mlp_builder = getattr(apex_op_builder, 'MlpBuilder') + + #load the module + self._loaded_module = mlp_builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load mlp_cuda : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module mlp_cuda has no attribute '{name}'") + + module = self._load_module() + return getattr(module, name) + + def __dir__(self): + try: + module = self._load_module() + return dir(module) + except: + return [] + + def __repr__(self): + return "" + +#replace module with lazy loader +sys.modules[__name__] = _MLPCudaModule() \ No newline at end of file diff --git a/compatibility/nccl_p2p_cuda.py b/compatibility/nccl_p2p_cuda.py new file mode 100644 index 000000000..d937cb95e --- /dev/null +++ b/compatibility/nccl_p2p_cuda.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _NcclP2pCudaModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'NCCLP2PBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load nccl_p2p_cuda : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module nccl_p2p_cuda has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _NcclP2pCudaModule() \ No newline 
at end of file diff --git a/compatibility/peer_memory_cuda.py b/compatibility/peer_memory_cuda.py new file mode 100644 index 000000000..d909ec1b9 --- /dev/null +++ b/compatibility/peer_memory_cuda.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _PeerMemoryCudaModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'PeerMemoryBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load peer_memory_cuda : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module peer_memory_cuda has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _PeerMemoryCudaModule() \ No newline at end of file diff --git a/compatibility/scaled_masked_softmax_cuda.py b/compatibility/scaled_masked_softmax_cuda.py new file mode 100644 index 000000000..77ed74e47 --- /dev/null +++ b/compatibility/scaled_masked_softmax_cuda.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _ScaledMaskedSoftmaxCudaModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'ScaledMaskedSoftmaxCudaBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load scaled_masked_softmax_cuda : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module scaled_masked_softmax_cuda has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _ScaledMaskedSoftmaxCudaModule() \ No newline at end of file diff --git a/compatibility/scaled_softmax_cuda.py b/compatibility/scaled_softmax_cuda.py new file mode 100644 index 000000000..d7a4427e3 --- /dev/null +++ b/compatibility/scaled_softmax_cuda.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _ScaledSoftmaxCudaModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'ScaledSoftmaxCudaBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load scaled_softmax_cuda : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module scaled_softmax_cuda has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return 
"" + +sys.modules[__name__] = _ScaledSoftmaxCudaModule() \ No newline at end of file diff --git a/compatibility/scaled_upper_triang_masked_softmax_cuda.py b/compatibility/scaled_upper_triang_masked_softmax_cuda.py new file mode 100644 index 000000000..8da9b5c67 --- /dev/null +++ b/compatibility/scaled_upper_triang_masked_softmax_cuda.py @@ -0,0 +1,38 @@ +import sys +import importlib + +class _ScaledUpperTriangMaskedSoftmaxCudaModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + name = 'ScaledUpperTriangMaskedSoftmaxCudaBuilder' + builder = getattr(apex_op_builder, name) + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load scaled_upper_triang_masked_softmax_cuda : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name_attr): + if name_attr.startswith("_"): + raise AttributeError(f"module scaled_upper_triang_masked_softmax_cuda has no attribute '{name_attr}'") + return getattr(self._load_module(), name_attr) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _ScaledUpperTriangMaskedSoftmaxCudaModule() \ No newline at end of file diff --git a/compatibility/syncbn.py b/compatibility/syncbn.py new file mode 100644 index 000000000..b619575dc --- /dev/null +++ b/compatibility/syncbn.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _SyncbnModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'SyncBnBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load syncbn : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module syncbn has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _SyncbnModule() \ No newline at end of file diff --git a/compatibility/transducer_joint_cuda.py b/compatibility/transducer_joint_cuda.py new file mode 100644 index 000000000..e06705fde --- /dev/null +++ b/compatibility/transducer_joint_cuda.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _TransducerJointCudaModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'TransducerJointBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load transducer_joint_cuda : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module transducer_joint_cuda has no attribute '{name}'") + return 
getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _TransducerJointCudaModule() \ No newline at end of file diff --git a/compatibility/transducer_loss_cuda.py b/compatibility/transducer_loss_cuda.py new file mode 100644 index 000000000..d5a2c0f36 --- /dev/null +++ b/compatibility/transducer_loss_cuda.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _TransducerLossCudaModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'TransducerLossBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load transducer_loss_cuda : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module transducer_loss_cuda has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _TransducerLossCudaModule() \ No newline at end of file diff --git a/compatibility/xentropy_cuda.py b/compatibility/xentropy_cuda.py new file mode 100644 index 000000000..ff4dc9733 --- /dev/null +++ b/compatibility/xentropy_cuda.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _XentropyCudaModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'XentropyBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load xentropy_cuda : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module xentropy_cuda has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _XentropyCudaModule() \ No newline at end of file diff --git a/op_builder/builder.py b/op_builder/builder.py index 22bd091a9..6784f17e0 100644 --- a/op_builder/builder.py +++ b/op_builder/builder.py @@ -865,12 +865,7 @@ def nccl_args(self): return nccl_library def nccl_version(self): - from torch.utils.cpp_extension import load - _nccl_version_getter = load( - name="_nccl_version_getter", - sources=["contrib/csrc/nccl_p2p/nccl_version.cpp", "contrib/csrc/nccl_p2p/nccl_version_check.cu"], - ) - return _nccl_version_getter.get_nccl_version() + return torch.cuda.nccl.version()[0:2] def torch_version(self): return (TORCH_MAJOR, TORCH_MINOR) diff --git a/setup.py b/setup.py index 23cfb4b0b..2614fa3df 100644 --- a/setup.py +++ b/setup.py @@ -23,6 +23,7 @@ sys.path.insert(0, os.path.abspath(os.path.dirname(__file__))) from op_builder.all_ops import ALL_OPS +import shutil # ninja build does not work unless include_dirs are abs path this_dir = os.path.dirname(os.path.abspath(__file__)) @@ -268,6 
+269,25 @@ def is_op_build_included(op_name): with open('requirements.txt') as f: required = f.read().splitlines() +# Find python files in compatibility folder +compatibility_dir = os.path.join(this_dir, 'compatibility') +py_modules = [] + +if os.path.exists(compatibility_dir): + for file in os.listdir(compatibility_dir): + if file.endswith('.py') and file != '__init__.py': + module_name = f"{file[:-3]}" + py_modules.append(module_name) + + #copy outside temporarily + src_file = os.path.join(compatibility_dir, file) + dst_file = os.path.join(this_dir, file) + shutil.copy2(src_file, dst_file) +else: + print("Warning: compatibility folder not found") + +print ("-----py_modules--------", py_modules) + setup( name="apex", version=get_apex_version(), @@ -279,5 +299,12 @@ def is_op_build_included(op_name): cmdclass={'build_ext': BuildExtension} if ext_modules else {}, extras_require=extras, install_requires=required, - include_package_data=True + include_package_data=True, + py_modules=py_modules ) + +#delete the temporarily copied compatibility files +for py_module in py_modules: + path = dst_file = os.path.join(this_dir, py_module + ".py") + if os.path.exists(path): + os.remove(path) \ No newline at end of file From 87ae01c204e515d6eec23044ee911319b8d1139f Mon Sep 17 00:00:00 2001 From: skishore Date: Tue, 12 Aug 2025 21:17:02 +0000 Subject: [PATCH 58/79] Restore to original importing the extension code. --- apex/amp/_process_optimizer.py | 7 +- apex/amp/scaler.py | 8 +- apex/contrib/clip_grad/clip_grad.py | 6 +- apex/contrib/focal_loss/focal_loss.py | 5 +- apex/contrib/groupbn/__init__.py | 3 +- apex/contrib/groupbn/batch_norm.py | 3 +- apex/contrib/index_mul_2d/index_mul_2d.py | 4 +- .../fast_encdec_multihead_attn_func.py | 4 +- ...ast_encdec_multihead_attn_norm_add_func.py | 3 +- .../fast_self_multihead_attn_func.py | 3 +- .../fast_self_multihead_attn_norm_add_func.py | 3 +- .../mask_softmax_dropout_func.py | 3 +- apex/contrib/nccl_allocator/nccl_allocator.py | 3 +- .../optimizers/distributed_fused_adam.py | 14 +- .../optimizers/distributed_fused_lamb.py | 19 +- apex/contrib/optimizers/fp16_optimizer.py | 7 +- apex/contrib/optimizers/fused_adam.py | 7 +- apex/contrib/optimizers/fused_lamb.py | 10 +- apex/contrib/optimizers/fused_sgd.py | 9 +- .../peer_halo_exchange_module_tests.py | 3 +- .../peer_memory/peer_halo_exchanger_1d.py | 3 +- apex/contrib/peer_memory/peer_memory.py | 3 +- apex/contrib/transducer/transducer.py | 5 +- apex/contrib/xentropy/__init__.py | 4 +- apex/contrib/xentropy/softmax_xentropy.py | 4 +- apex/fp16_utils/fp16_optimizer.py | 7 +- apex/mlp/mlp.py | 3 +- apex/multi_tensor_apply/__init__.py | 6 +- apex/multi_tensor_apply/multi_tensor_apply.py | 3 +- apex/normalization/fused_layer_norm.py | 25 +- apex/optimizers/fused_adagrad.py | 7 +- apex/optimizers/fused_adam.py | 7 +- apex/optimizers/fused_lamb.py | 9 +- apex/optimizers/fused_lars.py | 7 +- apex/optimizers/fused_mixed_precision_lamb.py | 11 +- apex/optimizers/fused_novograd.py | 7 +- apex/optimizers/fused_sgd.py | 7 +- apex/parallel/__init__.py | 1 + apex/parallel/distributed.py | 10 +- apex/parallel/optimized_sync_batchnorm.py | 3 + .../optimized_sync_batchnorm_kernel.py | 7 +- apex/transformer/functional/fused_rope.py | 12 +- apex/transformer/functional/fused_softmax.py | 25 +- apex/transformer/pipeline_parallel/utils.py | 8 +- apex/transformer/tensor_parallel/layers.py | 3 +- examples/imagenet/main_amp.py | 1 + tests/L0/log_test.txt | 4959 ----------------- tests/L0/run_amp/test_fused_sgd.py | 3 +- 
tests/L0/run_amp/test_multi_tensor_axpby.py | 3 +- tests/L0/run_amp/test_multi_tensor_l2norm.py | 3 +- tests/L0/run_amp/test_multi_tensor_scale.py | 3 +- tests/L0/run_optimizers/test_lamb.py | 7 +- .../run_transformer/test_fused_bias_swiglu.py | 3 +- tests/L1/common/main_amp.py | 3 +- .../synced_batchnorm/single_gpu_unit_test.py | 5 +- .../synced_batchnorm/test_groups.py | 3 +- .../synced_batchnorm/two_gpu_unit_test.py | 3 +- 57 files changed, 120 insertions(+), 5187 deletions(-) delete mode 100644 tests/L0/log_test.txt diff --git a/apex/amp/_process_optimizer.py b/apex/amp/_process_optimizer.py index 5f7ef4c9d..66c4c3fdf 100644 --- a/apex/amp/_process_optimizer.py +++ b/apex/amp/_process_optimizer.py @@ -1,6 +1,6 @@ import types from ..fp16_utils import master_params_to_model_params -from ..multi_tensor_apply import MultiTensorApply +from ..multi_tensor_apply import multi_tensor_applier from ._amp_state import maybe_print, _amp_state import torch from ..optimizers import FusedSGD @@ -13,7 +13,6 @@ def __init__(self): def _master_params_to_model_params(self): stash = self._amp_stash - multi_tensor_applier = MultiTensorApply(256*32) if multi_tensor_applier.available: if len(stash.all_fp16_params) > 0: multi_tensor_applier( @@ -320,7 +319,6 @@ def _amp_lazy_init(self): def _process_optimizer(optimizer, properties): - multi_tensor_applier = MultiTensorApply(256*32) if hasattr(optimizer, "_amp_stash"): raise RuntimeError("A given optimizer should only be passed through amp.initialize once.") else: @@ -340,8 +338,7 @@ def _process_optimizer(optimizer, properties): # TODO: Centralize exposure and import error checking for the C backend. if multi_tensor_applier.available: - from apex.op_builder import AmpCBuilder - amp_C = AmpCBuilder().load() + import amp_C optimizer._amp_stash.multi_tensor_scale = amp_C.multi_tensor_scale optimizer._amp_stash.multi_tensor_l2norm = amp_C.multi_tensor_l2norm optimizer._amp_stash.dummy_overflow_buf = torch.tensor([0], dtype=torch.int, device='cuda') diff --git a/apex/amp/scaler.py b/apex/amp/scaler.py index 33e431e71..c11f70398 100644 --- a/apex/amp/scaler.py +++ b/apex/amp/scaler.py @@ -1,5 +1,5 @@ import torch -from apex.multi_tensor_apply import MultiTensorApply +from ..multi_tensor_apply import multi_tensor_applier from ._amp_state import _amp_state, master_params, maybe_print from itertools import product @@ -63,10 +63,8 @@ def __init__(self, self._unskipped = 0 self._has_overflow = False self._overflow_buf = torch.tensor([0], dtype=torch.int, device='cuda') - multi_tensor_applier = MultiTensorApply(256*32) if multi_tensor_applier.available: - from apex.op_builder import AmpCBuilder - amp_C = AmpCBuilder().load() + import amp_C LossScaler.has_fused_kernel = multi_tensor_applier.available LossScaler.multi_tensor_scale_cuda = amp_C.multi_tensor_scale LossScaler.multi_tensor_axpby_cuda = amp_C.multi_tensor_axpby @@ -103,7 +101,6 @@ def unscale_python(self, model_grads, master_grads, scale): # unused_scale keeps some of the old API alive for hopefully a short time. 
def unscale(self, model_grads, master_grads, unused_scale, models_are_masters=False, scale_override=None): - multi_tensor_applier = MultiTensorApply(256*32) if self._has_overflow: return @@ -166,7 +163,6 @@ def unscale_with_stashed(self, stashed_master_grads, master_grads, scale_override=None): - multi_tensor_applier = MultiTensorApply(256*32) if self._has_overflow: return diff --git a/apex/contrib/clip_grad/clip_grad.py b/apex/contrib/clip_grad/clip_grad.py index 931786206..b6411352b 100644 --- a/apex/contrib/clip_grad/clip_grad.py +++ b/apex/contrib/clip_grad/clip_grad.py @@ -4,9 +4,8 @@ _kernel_import_succeeded = False try: - from apex.op_builder import AmpCBuilder - amp_C = AmpCBuilder().load() - from apex.multi_tensor_apply import MultiTensorApply + import amp_C + from apex.multi_tensor_apply import multi_tensor_applier _kernel_import_succeeded = True except ImportError: _kernel_import_succeeded = False @@ -76,7 +75,6 @@ def clip_grad_norm_( # Compute gradient L2 norms norms = [] dummy_overflow_buf = torch.zeros([1], dtype=torch.int32, device=device) - multi_tensor_applier = MultiTensorApply(256*32) if grads_fp32: norms.append( multi_tensor_applier( diff --git a/apex/contrib/focal_loss/focal_loss.py b/apex/contrib/focal_loss/focal_loss.py index a63c43103..85c6f620e 100644 --- a/apex/contrib/focal_loss/focal_loss.py +++ b/apex/contrib/focal_loss/focal_loss.py @@ -1,6 +1,7 @@ import torch -from apex.op_builder import FocalLossBuilder -focal_loss_cuda = FocalLossBuilder().load() + +import focal_loss_cuda + class FocalLoss(torch.autograd.Function): @staticmethod diff --git a/apex/contrib/groupbn/__init__.py b/apex/contrib/groupbn/__init__.py index 9ab407c01..2f8577066 100644 --- a/apex/contrib/groupbn/__init__.py +++ b/apex/contrib/groupbn/__init__.py @@ -1,7 +1,6 @@ try: import torch - from apex.op_builder import BnpBuilder - bnp = BnpBuilder().load() + import bnp from .batch_norm import BatchNorm2d_NHWC del torch del bnp diff --git a/apex/contrib/groupbn/batch_norm.py b/apex/contrib/groupbn/batch_norm.py index b7bc79676..af0b7e9b2 100644 --- a/apex/contrib/groupbn/batch_norm.py +++ b/apex/contrib/groupbn/batch_norm.py @@ -2,8 +2,7 @@ import numpy as np from torch.nn.modules.batchnorm import _BatchNorm -from apex.op_builder import BnpBuilder -bnp = BnpBuilder().load() +import bnp def check_if_rocm_pytorch(): is_rocm_pytorch = False diff --git a/apex/contrib/index_mul_2d/index_mul_2d.py b/apex/contrib/index_mul_2d/index_mul_2d.py index 3ecfff888..1d34fe20c 100644 --- a/apex/contrib/index_mul_2d/index_mul_2d.py +++ b/apex/contrib/index_mul_2d/index_mul_2d.py @@ -1,6 +1,6 @@ import torch -from apex.op_builder import FusedIndexMul2dBuilder -fused_index_mul_2d = FusedIndexMul2dBuilder().load() + +import fused_index_mul_2d class IndexMul2d_(torch.autograd.Function): ''' diff --git a/apex/contrib/multihead_attn/fast_encdec_multihead_attn_func.py b/apex/contrib/multihead_attn/fast_encdec_multihead_attn_func.py index ba6d865ca..9431a4936 100644 --- a/apex/contrib/multihead_attn/fast_encdec_multihead_attn_func.py +++ b/apex/contrib/multihead_attn/fast_encdec_multihead_attn_func.py @@ -1,7 +1,7 @@ import torch -from apex.op_builder import FastMultiheadAttnBuilder -fast_multihead_attn = FastMultiheadAttnBuilder().load() +import fast_multihead_attn + class FastEncdecAttnFunc(torch.autograd.Function): @staticmethod diff --git a/apex/contrib/multihead_attn/fast_encdec_multihead_attn_norm_add_func.py b/apex/contrib/multihead_attn/fast_encdec_multihead_attn_norm_add_func.py index 516e888a9..320bebd66 
100644 --- a/apex/contrib/multihead_attn/fast_encdec_multihead_attn_norm_add_func.py +++ b/apex/contrib/multihead_attn/fast_encdec_multihead_attn_norm_add_func.py @@ -7,8 +7,7 @@ import torch -from apex.op_builder import FastMultiheadAttnBuilder -fast_multihead_attn = FastMultiheadAttnBuilder().load() +import fast_multihead_attn class FastEncdecAttnNormAddFunc(torch.autograd.Function): diff --git a/apex/contrib/multihead_attn/fast_self_multihead_attn_func.py b/apex/contrib/multihead_attn/fast_self_multihead_attn_func.py index 9cbe22a58..6b50fe227 100644 --- a/apex/contrib/multihead_attn/fast_self_multihead_attn_func.py +++ b/apex/contrib/multihead_attn/fast_self_multihead_attn_func.py @@ -1,7 +1,6 @@ import torch -from apex.op_builder import FastMultiheadAttnBuilder -fast_multihead_attn = FastMultiheadAttnBuilder().load() +import fast_multihead_attn class FastSelfAttnFunc(torch.autograd.Function): diff --git a/apex/contrib/multihead_attn/fast_self_multihead_attn_norm_add_func.py b/apex/contrib/multihead_attn/fast_self_multihead_attn_norm_add_func.py index f0a0dc2a8..7f110cb33 100644 --- a/apex/contrib/multihead_attn/fast_self_multihead_attn_norm_add_func.py +++ b/apex/contrib/multihead_attn/fast_self_multihead_attn_norm_add_func.py @@ -1,7 +1,6 @@ import torch -from apex.op_builder import FastMultiheadAttnBuilder -fast_multihead_attn = FastMultiheadAttnBuilder().load() +import fast_multihead_attn class FastSelfAttnNormAddFunc(torch.autograd.Function): diff --git a/apex/contrib/multihead_attn/mask_softmax_dropout_func.py b/apex/contrib/multihead_attn/mask_softmax_dropout_func.py index e92703b21..b34eec444 100644 --- a/apex/contrib/multihead_attn/mask_softmax_dropout_func.py +++ b/apex/contrib/multihead_attn/mask_softmax_dropout_func.py @@ -1,7 +1,6 @@ import torch -from apex.op_builder import FastMultiheadAttnBuilder -fast_multihead_attn = FastMultiheadAttnBuilder().load() +import fast_multihead_attn class MaskSoftmaxDropout(torch.autograd.Function): diff --git a/apex/contrib/nccl_allocator/nccl_allocator.py b/apex/contrib/nccl_allocator/nccl_allocator.py index 0700f907f..62fcee756 100644 --- a/apex/contrib/nccl_allocator/nccl_allocator.py +++ b/apex/contrib/nccl_allocator/nccl_allocator.py @@ -1,7 +1,6 @@ import os import torch -from apex.op_builder import NCCLAllocatorBuilder -_apex_nccl_allocator = NCCLAllocatorBuilder().load() +import _apex_nccl_allocator from contextlib import nullcontext diff --git a/apex/contrib/optimizers/distributed_fused_adam.py b/apex/contrib/optimizers/distributed_fused_adam.py index 18a5779eb..65da11218 100644 --- a/apex/contrib/optimizers/distributed_fused_adam.py +++ b/apex/contrib/optimizers/distributed_fused_adam.py @@ -27,10 +27,9 @@ except ImportError: nccl_allocator = None -from apex.multi_tensor_apply import MultiTensorApply -from apex.op_builder import AmpCBuilder, DistributedAdamBuilder, FusedAdamBuilder -amp_C = AmpCBuilder().load() -distributed_adam_cuda = DistributedAdamBuilder().load() +from apex.multi_tensor_apply import multi_tensor_applier +import amp_C +import distributed_adam_cuda # Fallback to private functions if using PyTorch <1.13.0 try: @@ -127,7 +126,7 @@ def _coalescing_manager_append_work( # Import optional CUDA kernels _FOUND_DEPRECATED_FUSED_ADAM: bool = False try: - fused_adam_cuda = FusedAdamBuilder().load() + import fused_adam_cuda _FOUND_DEPRECATED_FUSED_ADAM = True except ImportError: @@ -211,7 +210,6 @@ def _multi_tensor_copy( use_fused_kernel = use_fused_kernel and is_cuda and is_contiguous # Copy buffers - 
multi_tensor_applier = MultiTensorApply(256*32) if use_fused_kernel and _FOUND_DEPRECATED_FUSED_ADAM: if dummy_overflow_buf is None: dummy_overflow_buf = torch.zeros([1], dtype=torch.int32, device="cuda") @@ -2266,7 +2264,6 @@ def _local_grad_norm( ) # Compute norm of each group of grads - multi_tensor_applier = MultiTensorApply(256*32) grad_norm_sq = None for grad_group in grad_groups.values(): grad_group_norm_sq = ( @@ -2665,7 +2662,6 @@ def _local_step(self, bucket_ids: List[int]) -> None: # Apply optimizer step to each param group adam_func = distributed_adam_cuda.multi_tensor_fused_adam_capturable \ if self.capturable else distributed_adam_cuda.multi_tensor_fused_adam - multi_tensor_applier = MultiTensorApply(256*32) for (group_id, _, _, _), group_buffers in buffers.items(): group = self.param_groups[group_id] beta1, beta2 = group["betas"] @@ -2753,7 +2749,6 @@ def _local_step_with_param_remainders( ) # Apply optimizer step to each param group - multi_tensor_applier = MultiTensorApply(256*32) for (group_id, _), group_buffers in buffers.items(): group = self.param_groups[group_id] beta1, beta2 = group["betas"] @@ -2837,7 +2832,6 @@ def _local_step_with_scaled_states( buf.mul_(scale) # Apply optimizer step to each param group - multi_tensor_applier = MultiTensorApply(256*32) for group_id, buffers in group_buffers.items(): group = self.param_groups[group_id] beta1, beta2 = group["betas"] diff --git a/apex/contrib/optimizers/distributed_fused_lamb.py b/apex/contrib/optimizers/distributed_fused_lamb.py index 4df7266c1..0925bd04a 100644 --- a/apex/contrib/optimizers/distributed_fused_lamb.py +++ b/apex/contrib/optimizers/distributed_fused_lamb.py @@ -3,9 +3,8 @@ import inspect import torch import importlib -from apex.op_builder import AmpCBuilder, DistributedLambBuilder, FusedAdamBuilder -amp_C = AmpCBuilder().load() -from apex.multi_tensor_apply import MultiTensorApply +import amp_C +from apex.multi_tensor_apply import multi_tensor_applier import torch.distributed.distributed_c10d as c10d @@ -114,14 +113,14 @@ def __init__(self, params, super(DistributedFusedLAMB, self).__init__(params, defaults) global fused_adam_cuda, distributed_lamb_cuda - fused_adam_cuda = FusedAdamBuilder().load() - distributed_lamb_cuda = DistributedLambBuilder().load() + fused_adam_cuda = importlib.import_module("fused_adam_cuda") + distributed_lamb_cuda = importlib.import_module("distributed_lamb_cuda") self._overflow_buf = torch.cuda.IntTensor([0]) self._has_overflow = False self.multi_tensor_lamb_compute_update_term = distributed_lamb_cuda.multi_tensor_lamb_compute_update_term self.multi_tensor_lamb_update_weights = distributed_lamb_cuda.multi_tensor_lamb_update_weights - amp_C = AmpCBuilder().load() + import amp_C self.multi_tensor_l2norm = amp_C.multi_tensor_l2norm self._grad_averaging = grad_averaging @@ -735,8 +734,7 @@ def _pipeline_block_reductions(self, block_id): else: self._reduce_scatter_and_all_reduce(block_id) - # Compute L2 grad nor - multi_tensor_applier = MultiTensorApply(256*32) + # Compute L2 grad norm if block_id == 0: with torch.cuda.stream(self._l2_grad_norm_st): for block_id in range(self._num_blocks): @@ -791,7 +789,6 @@ def _pipeline_block_reductions(self, block_id): self._reductions_works[block_id][chunk_id].wait() def __compute_contrib_param_norm(self): - multi_tensor_applier = MultiTensorApply(256*32) if self._contrib_model_param_for_norm_fp16 is not None and self._contrib_model_param_for_norm_fp32 is not None: gnorm_fp16 = multi_tensor_applier(self.multi_tensor_l2norm, 
self._overflow_buf, [self._contrib_model_param_for_norm_fp16], True)[1] gnorm_fp32 = multi_tensor_applier(self.multi_tensor_l2norm, self._overflow_buf, [self._contrib_model_param_for_norm_fp32], True)[1] @@ -805,7 +802,6 @@ def __compute_contrib_param_norm(self): return gnorm def __compute_contrib_update_norm(self): - multi_tensor_applier = MultiTensorApply(256*32) l2_norm = torch.zeros(size=[self._model_params_num], dtype=torch.float32, device='cuda') local_contrib_l2_norm = multi_tensor_applier(self.multi_tensor_l2norm, self._overflow_buf, [self._contrib_update_frag_for_norm], True)[1] ** 2 l2_norm.scatter_(dim=0, index=self._offsets, src=local_contrib_l2_norm) @@ -814,7 +810,6 @@ def __compute_contrib_update_norm(self): return l2_norm def _pipeline_step(self): - multi_tensor_applier = MultiTensorApply(256*32) global_scale = self.global_scale # if clip before ar, set max_grad_norm to 0 max_grad_norm = self.defaults['max_grad_norm'] * self._clip_after_ar @@ -904,7 +899,6 @@ def _pipeline_step(self): ) def _flatten_grad_mt(self, scale): - multi_tensor_applier = MultiTensorApply(256*32) if len(self._grads_fp16) > 0: self._overflow_buf.zero_() if not self._fused_norm: @@ -991,7 +985,6 @@ def complete_reductions(self): self._grads_generated = [False]*len(self._grads_info) def step(self, closure=None, grad_scaler=None): - multi_tensor_applier = MultiTensorApply(256*32) loss = None if closure is not None: loss = closure() diff --git a/apex/contrib/optimizers/fp16_optimizer.py b/apex/contrib/optimizers/fp16_optimizer.py index b6338c8c5..0cbb63b82 100755 --- a/apex/contrib/optimizers/fp16_optimizer.py +++ b/apex/contrib/optimizers/fp16_optimizer.py @@ -1,5 +1,5 @@ import torch -from apex.multi_tensor_apply import MultiTensorApply +from apex.multi_tensor_apply import multi_tensor_applier class FP16_Optimizer(object): """ @@ -53,10 +53,8 @@ def __init__(self, self.fp32_groups.append(fp32_group) param_group['params'] = fp32_group - multi_tensor_applier = MultiTensorApply(256*32) if multi_tensor_applier.available: - from apex.op_builder import AmpCBuilder - amp_C = AmpCBuilder().load() + import amp_C self.overflow_buf = torch.cuda.IntTensor([0]) self.multi_tensor_l2norm=amp_C.multi_tensor_l2norm else: @@ -109,7 +107,6 @@ def step(self, closure=None): # nan check self.overflow_buf.zero_() - multi_tensor_applier = MultiTensorApply(256*32) for fp16_grad in fp16_grads: if len(fp16_grad) > 0: norm, norm_per_tensor = multi_tensor_applier(self.multi_tensor_l2norm, diff --git a/apex/contrib/optimizers/fused_adam.py b/apex/contrib/optimizers/fused_adam.py index 379399ee2..a823e7be6 100644 --- a/apex/contrib/optimizers/fused_adam.py +++ b/apex/contrib/optimizers/fused_adam.py @@ -1,7 +1,7 @@ import types import torch import importlib -from apex.multi_tensor_apply import MultiTensorApply +from apex.multi_tensor_apply import multi_tensor_applier class FusedAdam(torch.optim.Optimizer): @@ -41,12 +41,10 @@ def __init__(self, params, weight_decay=0., max_grad_norm=0., amsgrad=False, use_mt=False, amp_scale_adjustment=1.0): global fused_adam_cuda - from apex.op_builder import FusedAdamBuilder - fused_adam_cuda = FusedAdamBuilder().load() + fused_adam_cuda = importlib.import_module("fused_adam_cuda") self._use_multi_tensor = False if use_mt: - multi_tensor_applier = MultiTensorApply(256*32) if not multi_tensor_applier.available: print("Warning: multi_tensor_applier is unavailable") else: @@ -190,7 +188,6 @@ def step(self, closure=None, grads=None, output_params=None, scale=1., grad_norm group['weight_decay']) if 
self._use_multi_tensor: - multi_tensor_applier = MultiTensorApply(256*32) with torch.cuda.device(tensordevice): multi_tensor_applier( fused_adam_cuda.adam_mt, diff --git a/apex/contrib/optimizers/fused_lamb.py b/apex/contrib/optimizers/fused_lamb.py index 4ca301682..81d868228 100644 --- a/apex/contrib/optimizers/fused_lamb.py +++ b/apex/contrib/optimizers/fused_lamb.py @@ -1,7 +1,7 @@ import torch import importlib import math -from apex.multi_tensor_apply import MultiTensorApply +from apex.multi_tensor_apply import multi_tensor_applier class FusedLAMB(torch.optim.Optimizer): @@ -72,14 +72,11 @@ def __init__(self, params, lr=1e-3, bias_correction=True, grad_averaging=grad_averaging, max_grad_norm=max_grad_norm) super(FusedLAMB, self).__init__(params, defaults) - multi_tensor_applier = MultiTensorApply(256*32) if multi_tensor_applier.available: - from apex.op_builder import AmpCBuilder - amp_C = AmpCBuilder().load() + import amp_C self.multi_tensor_l2norm=amp_C.multi_tensor_l2norm self._dummy_overflow_buf = torch.cuda.IntTensor([0]) - from apex.op_builder import FusedLambBuilder - fused_lamb_cuda = FusedLambBuilder().load() + fused_lamb_cuda = importlib.import_module("fused_lamb_cuda") self.multi_tensor_lamb = fused_lamb_cuda.lamb else: raise RuntimeError('apex.contrib.optimizers.FusedLAMB requires cuda extensions') @@ -121,7 +118,6 @@ def step(self, closure=None): g_norm_32, g_norm_16 = 0.0, 0.0 # compute grad norm for two lists - multi_tensor_applier = MultiTensorApply(256*32) if len(g_all_32) > 0: g_norm_32 = multi_tensor_applier(self.multi_tensor_l2norm, self._dummy_overflow_buf, diff --git a/apex/contrib/optimizers/fused_sgd.py b/apex/contrib/optimizers/fused_sgd.py index de93e313e..83587c6a6 100644 --- a/apex/contrib/optimizers/fused_sgd.py +++ b/apex/contrib/optimizers/fused_sgd.py @@ -2,7 +2,7 @@ import torch from torch.optim.optimizer import Optimizer, required -from apex.multi_tensor_apply import MultiTensorApply +from apex.multi_tensor_apply import multi_tensor_applier class FusedSGD(Optimizer): r"""Implements stochastic gradient descent (optionally with momentum). 
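A minimal sketch (not part of the patch) of the shared multi_tensor_applier call pattern that the surrounding hunks restore; it assumes the prebuilt amp_C extension is importable, and the tensors and the 2.0 scale factor are illustrative values only.

import torch
import amp_C
from apex.multi_tensor_apply import multi_tensor_applier  # module-level applier (chunk size 256*32)

# Flag buffer the fused kernels use to report inf/nan; it stays zero on success.
overflow_buf = torch.zeros(1, dtype=torch.int, device="cuda")
srcs = [torch.randn(1024, device="cuda") for _ in range(4)]
dsts = [torch.empty_like(t) for t in srcs]

# multi_tensor_scale copies every src into its dst while multiplying by the
# scale factor, processing all tensors in fixed-size chunks from one fused launch.
multi_tensor_applier(amp_C.multi_tensor_scale, overflow_buf, [srcs, dsts], 2.0)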
@@ -82,10 +82,8 @@ def __init__(self, params, lr=required, momentum=0, dampening=0, self.wd_after_momentum = wd_after_momentum - multi_tensor_applier = MultiTensorApply(256*32) if multi_tensor_applier.available: - from apex.op_builder import AmpCBuilder - amp_C = AmpCBuilder().load() + import amp_C # Skip buffer self._dummy_overflow_buf = torch.cuda.IntTensor([0]) self.multi_tensor_sgd = amp_C.multi_tensor_sgd @@ -192,12 +190,11 @@ def step(self, closure=None, grads=None, output_params=None, scale=1., grad_norm fp16_set = [fp16_grads, fp32_from_fp16_params, fp32_from_fp16_momentums, fp16_params] launch_sets = [fp16_set, fp32_set] - multi_tensor_applier = MultiTensorApply(256*32) for launch_set, first_run in zip(launch_sets, first_runs): assert len(launch_set[0]) == len(launch_set[1]) assert len(launch_set[0]) == len(launch_set[2]) - if len(launch_set[0]) > 0: + if len(launch_set[0]) > 0: multi_tensor_applier( self.multi_tensor_sgd, self._dummy_overflow_buf, diff --git a/apex/contrib/peer_memory/peer_halo_exchange_module_tests.py b/apex/contrib/peer_memory/peer_halo_exchange_module_tests.py index 135482186..bd85354af 100644 --- a/apex/contrib/peer_memory/peer_halo_exchange_module_tests.py +++ b/apex/contrib/peer_memory/peer_halo_exchange_module_tests.py @@ -1,7 +1,6 @@ import torch from apex.contrib.peer_memory import PeerMemoryPool, PeerHaloExchanger1d -from apex.op_builder import PeerMemoryBuilder -pm = PeerMemoryBuilder().load() +import peer_memory_cuda as pm # How to run: # torchrun --nproc_per_node diff --git a/apex/contrib/peer_memory/peer_halo_exchanger_1d.py b/apex/contrib/peer_memory/peer_halo_exchanger_1d.py index 773ca6f4b..cc25693ce 100644 --- a/apex/contrib/peer_memory/peer_halo_exchanger_1d.py +++ b/apex/contrib/peer_memory/peer_halo_exchanger_1d.py @@ -1,7 +1,6 @@ import torch from apex.contrib.peer_memory import PeerMemoryPool -from apex.op_builder import PeerMemoryBuilder -pm = PeerMemoryBuilder().load() +import peer_memory_cuda as pm class PeerHaloExchanger1d: def __init__(self, ranks, rank_in_group, peer_pool, half_halo): diff --git a/apex/contrib/peer_memory/peer_memory.py b/apex/contrib/peer_memory/peer_memory.py index 5f9e07773..adb218219 100644 --- a/apex/contrib/peer_memory/peer_memory.py +++ b/apex/contrib/peer_memory/peer_memory.py @@ -1,7 +1,6 @@ import torch import numpy as np -from apex.op_builder import PeerMemoryBuilder -pm = PeerMemoryBuilder().load() +import peer_memory_cuda as pm class PeerMemoryPool(object): diff --git a/apex/contrib/transducer/transducer.py b/apex/contrib/transducer/transducer.py index c0a57dea7..784396275 100755 --- a/apex/contrib/transducer/transducer.py +++ b/apex/contrib/transducer/transducer.py @@ -1,7 +1,6 @@ import torch -from apex.op_builder import TransducerJointBuilder, TransducerLossBuilder -transducer_loss_cuda = TransducerLossBuilder().load() -transducer_joint_cuda = TransducerJointBuilder().load() +import transducer_loss_cuda +import transducer_joint_cuda class TransducerJoint(torch.nn.Module): """Transducer joint diff --git a/apex/contrib/xentropy/__init__.py b/apex/contrib/xentropy/__init__.py index dcff69d53..7dff6a27a 100644 --- a/apex/contrib/xentropy/__init__.py +++ b/apex/contrib/xentropy/__init__.py @@ -1,8 +1,6 @@ try: import torch - from apex.op_builder import XentropyBuilder - xentropy_cuda = XentropyBuilder().load() - + import xentropy_cuda from .softmax_xentropy import SoftmaxCrossEntropyLoss del torch del xentropy_cuda diff --git a/apex/contrib/xentropy/softmax_xentropy.py 
b/apex/contrib/xentropy/softmax_xentropy.py index 4a8f97f3c..33fbf8b21 100644 --- a/apex/contrib/xentropy/softmax_xentropy.py +++ b/apex/contrib/xentropy/softmax_xentropy.py @@ -1,7 +1,5 @@ import torch -from apex.op_builder import XentropyBuilder - -xentropy_cuda = XentropyBuilder().load() +import xentropy_cuda class SoftmaxCrossEntropyLoss(torch.autograd.Function): @staticmethod diff --git a/apex/fp16_utils/fp16_optimizer.py b/apex/fp16_utils/fp16_optimizer.py index e9647e442..7c0dd397f 100755 --- a/apex/fp16_utils/fp16_optimizer.py +++ b/apex/fp16_utils/fp16_optimizer.py @@ -6,7 +6,7 @@ from ..amp._amp_state import _amp_state, maybe_print from ..amp.scaler import LossScaler -from apex.multi_tensor_apply import MultiTensorApply +from ..multi_tensor_apply import multi_tensor_applier from .fp16util import model_grads_to_master_grads, master_params_to_model_params, clip_grad_norm # TODO: Update overflow check + downscale to use Carl's fused kernel. @@ -100,10 +100,8 @@ def __init__(self, self.clip_grad_norm = clip_grad_norm # TODO: Centralize exposure and import error checking for the C backend. - multi_tensor_applier = MultiTensorApply(256*32) if multi_tensor_applier.available: - from apex.op_builder import AmpCBuilder - amp_C = AmpCBuilder().load() + import amp_C self.multi_tensor_scale = amp_C.multi_tensor_scale self._dummy_overflow_buf = torch.cuda.IntTensor([0]); @@ -160,7 +158,6 @@ def zero_grad(self, set_grads_to_None=False): # self.loss_scaler.update_scale(has_overflow) def _master_params_to_model_params(self): - multi_tensor_applier = MultiTensorApply(256*32) if multi_tensor_applier.available: if len(self.all_fp16_params) > 0: multi_tensor_applier( diff --git a/apex/mlp/mlp.py b/apex/mlp/mlp.py index fe3d382fd..31b853292 100644 --- a/apex/mlp/mlp.py +++ b/apex/mlp/mlp.py @@ -5,9 +5,8 @@ from torch import nn from apex._autocast_utils import _cast_if_autocast_enabled -from apex.op_builder import MlpBuilder +import mlp_cuda -mlp_cuda = MlpBuilder().load() class MlpFunction(torch.autograd.Function): @staticmethod diff --git a/apex/multi_tensor_apply/__init__.py b/apex/multi_tensor_apply/__init__.py index 88de4cdfe..31e2a53de 100644 --- a/apex/multi_tensor_apply/__init__.py +++ b/apex/multi_tensor_apply/__init__.py @@ -1 +1,5 @@ -from .multi_tensor_apply import MultiTensorApply \ No newline at end of file +from .multi_tensor_apply import MultiTensorApply + +multi_tensor_applier = MultiTensorApply(256*32) +multi_tensor_applier_l2norm = MultiTensorApply(2048*32) + diff --git a/apex/multi_tensor_apply/multi_tensor_apply.py b/apex/multi_tensor_apply/multi_tensor_apply.py index 94ff3fe73..346c6e50f 100644 --- a/apex/multi_tensor_apply/multi_tensor_apply.py +++ b/apex/multi_tensor_apply/multi_tensor_apply.py @@ -6,8 +6,7 @@ class MultiTensorApply(object): def __init__(self, chunk_size): try: - from apex.op_builder import AmpCBuilder - amp_C = AmpCBuilder().load() + import amp_C MultiTensorApply.available = True self.chunk_size = chunk_size except ImportError as err: diff --git a/apex/normalization/fused_layer_norm.py b/apex/normalization/fused_layer_norm.py index 493271577..0c7bd2e09 100644 --- a/apex/normalization/fused_layer_norm.py +++ b/apex/normalization/fused_layer_norm.py @@ -8,7 +8,6 @@ from typing import List, Tuple from apex._autocast_utils import _cast_if_autocast_enabled -from apex.op_builder import FusedLayerNormBuilder global fused_layer_norm_cuda fused_layer_norm_cuda = None @@ -41,7 +40,7 @@ class FusedLayerNormAffineFunction(torch.autograd.Function): def forward(ctx, input, 
weight, bias, normalized_shape, eps, memory_efficient=False): global fused_layer_norm_cuda if fused_layer_norm_cuda is None: - fused_layer_norm_cuda = FusedLayerNormBuilder().load() + fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda") ctx.normalized_shape = normalized_shape ctx.eps = eps ctx.memory_efficient = memory_efficient @@ -81,7 +80,7 @@ def fused_layer_norm_affine_fwd( ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: global fused_layer_norm_cuda if fused_layer_norm_cuda is None: - fused_layer_norm_cuda = FusedLayerNormBuilder().load() + fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda") input_ = input.contiguous() weight_ = weight.contiguous() @@ -198,7 +197,7 @@ class FusedRMSNormAffineFunction(torch.autograd.Function): def forward(ctx, input, weight, normalized_shape, eps, memory_efficient=False): global fused_layer_norm_cuda if fused_layer_norm_cuda is None: - fused_layer_norm_cuda = FusedLayerNormBuilder().load() + fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda") ctx.normalized_shape = normalized_shape ctx.eps = eps ctx.memory_efficient = memory_efficient @@ -233,7 +232,7 @@ def fused_rms_norm_affine_fwd( ) -> Tuple[torch.Tensor, torch.Tensor]: global fused_layer_norm_cuda if fused_layer_norm_cuda is None: - fused_layer_norm_cuda = FusedLayerNormBuilder().load() + fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda") input_ = input.contiguous() weight_ = weight.contiguous() @@ -351,7 +350,7 @@ class FusedLayerNormAffineMixedDtypesFunction(FusedLayerNormAffineFunction): def forward(ctx, input, weight, bias, normalized_shape, eps, memory_efficient=False): global fused_layer_norm_cuda if fused_layer_norm_cuda is None: - fused_layer_norm_cuda = FusedLayerNormBuilder().load() + fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda") ctx.normalized_shape = normalized_shape ctx.eps = eps ctx.memory_efficient = memory_efficient @@ -374,7 +373,7 @@ class FusedRMSNormAffineMixedDtypesFunction(FusedRMSNormAffineFunction): def forward(ctx, input, weight, normalized_shape, eps, memory_efficient=False): global fused_layer_norm_cuda if fused_layer_norm_cuda is None: - fused_layer_norm_cuda = FusedLayerNormBuilder().load() + fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda") ctx.normalized_shape = normalized_shape ctx.eps = eps ctx.memory_efficient = memory_efficient @@ -395,7 +394,7 @@ class FusedLayerNormFunction(torch.autograd.Function): def forward(ctx, input, normalized_shape, eps, memory_efficient=False): global fused_layer_norm_cuda if fused_layer_norm_cuda is None: - fused_layer_norm_cuda = FusedLayerNormBuilder().load() + fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda") ctx.normalized_shape = normalized_shape ctx.eps = eps ctx.memory_efficient = memory_efficient @@ -428,7 +427,7 @@ def fused_layer_norm_fwd( ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: global fused_layer_norm_cuda if fused_layer_norm_cuda is None: - fused_layer_norm_cuda = FusedLayerNormBuilder().load() + fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda") input_ = input.contiguous() output, mean, invvar = fused_layer_norm_cuda.forward( @@ -526,7 +525,7 @@ class FusedRMSNormFunction(torch.autograd.Function): def forward(ctx, input, normalized_shape, eps, memory_efficient=False): global fused_layer_norm_cuda if fused_layer_norm_cuda is None: - fused_layer_norm_cuda = FusedLayerNormBuilder().load() + fused_layer_norm_cuda = 
importlib.import_module("fused_layer_norm_cuda") ctx.normalized_shape = normalized_shape ctx.eps = eps ctx.memory_efficient = memory_efficient @@ -559,7 +558,7 @@ def fused_rms_norm_fwd( ) -> Tuple[torch.Tensor, torch.Tensor]: global fused_layer_norm_cuda if fused_layer_norm_cuda is None: - fused_layer_norm_cuda = FusedLayerNormBuilder().load() + fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda") input_ = input.contiguous() output, invvar = fused_layer_norm_cuda.rms_forward( @@ -774,7 +773,7 @@ def __init__(self, normalized_shape, eps=1e-5, elementwise_affine=True, memory_e super().__init__() global fused_layer_norm_cuda - fused_layer_norm_cuda = FusedLayerNormBuilder().load() + fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda") if isinstance(normalized_shape, numbers.Integral): normalized_shape = (normalized_shape,) @@ -873,7 +872,7 @@ def __init__(self, normalized_shape, eps=1e-5, elementwise_affine=True, memory_e super().__init__() global fused_layer_norm_cuda - fused_layer_norm_cuda = FusedLayerNormBuilder().load() + fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda") if isinstance(normalized_shape, numbers.Integral): normalized_shape = (normalized_shape,) diff --git a/apex/optimizers/fused_adagrad.py b/apex/optimizers/fused_adagrad.py index ed3a93690..8d1ef6f32 100644 --- a/apex/optimizers/fused_adagrad.py +++ b/apex/optimizers/fused_adagrad.py @@ -1,5 +1,5 @@ import torch -from apex.multi_tensor_apply import MultiTensorApply +from apex.multi_tensor_apply import multi_tensor_applier class FusedAdagrad(torch.optim.Optimizer): @@ -48,10 +48,8 @@ def __init__(self, params, lr=1e-2, eps=1e-10, self.adagrad_w_mode = 1 if adagrad_w_mode else 0 self.set_grad_none = set_grad_none - multi_tensor_applier = MultiTensorApply(256*32) if multi_tensor_applier.available: - from apex.op_builder import AmpCBuilder - amp_C = AmpCBuilder().load() + import amp_C # Skip buffer self._dummy_overflow_buf = torch.cuda.IntTensor([0]) self.multi_tensor_adagrad = amp_C.multi_tensor_adagrad @@ -104,7 +102,6 @@ def step(self, closure=None): else: raise RuntimeError('FusedAdagrad only support fp16, bfloat16 and fp32.') - multi_tensor_applier = MultiTensorApply(256*32) if(len(g_16) > 0): multi_tensor_applier(self.multi_tensor_adagrad, self._dummy_overflow_buf, diff --git a/apex/optimizers/fused_adam.py b/apex/optimizers/fused_adam.py index 294269747..2ecfc077d 100644 --- a/apex/optimizers/fused_adam.py +++ b/apex/optimizers/fused_adam.py @@ -1,5 +1,5 @@ import torch -from apex.multi_tensor_apply import MultiTensorApply +from apex.multi_tensor_apply import multi_tensor_applier class FusedAdam(torch.optim.Optimizer): @@ -106,10 +106,8 @@ def __init__(self, params, lr=1e-3, bias_correction=True, self._step_supports_amp_scaling = True - multi_tensor_applier = MultiTensorApply(256*32) if multi_tensor_applier.available: - from apex.op_builder import AmpCBuilder - amp_C = AmpCBuilder().load() + import amp_C # Skip buffer self._dummy_overflow_buf = torch.tensor([0], dtype=torch.int, device='cuda') self.multi_tensor_adam = amp_C.multi_tensor_adam @@ -138,7 +136,6 @@ def step(self, closure=None, grads=None, output_params=None, scale=None, grad_no if any(p is not None for p in [grads, output_params, scale, grad_norms]): raise RuntimeError('FusedAdam has been updated. 
Simply initialize it identically to torch.optim.Adam, and call step() with no arguments.') loss = None - multi_tensor_applier = MultiTensorApply(256*32) if closure is not None: loss = closure() diff --git a/apex/optimizers/fused_lamb.py b/apex/optimizers/fused_lamb.py index 37b5642ec..a77e0cd54 100644 --- a/apex/optimizers/fused_lamb.py +++ b/apex/optimizers/fused_lamb.py @@ -1,5 +1,5 @@ import torch -from apex.multi_tensor_apply import MultiTensorApply +from apex.multi_tensor_apply import multi_tensor_applier, multi_tensor_applier_l2norm class FusedLAMB(torch.optim.Optimizer): @@ -72,11 +72,8 @@ def __init__(self, params, lr=1e-3, bias_correction=True, grad_averaging=grad_averaging, max_grad_norm=max_grad_norm) super(FusedLAMB, self).__init__(params, defaults) - multi_tensor_applier = MultiTensorApply(256*32) - multi_tensor_applier_l2norm = MultiTensorApply(2048*32) if multi_tensor_applier.available and multi_tensor_applier_l2norm.available: - from apex.op_builder import AmpCBuilder - amp_C = AmpCBuilder().load() + import amp_C self.multi_tensor_l2norm=amp_C.multi_tensor_l2norm # Skip buffer self._dummy_overflow_buf = torch.tensor([0], dtype=torch.int, device=self.param_groups[0]["params"][0].device) @@ -123,8 +120,6 @@ def step(self, closure=None): device = self.param_groups[0]["params"][0].device g_norm_32, g_norm_16 = torch.zeros(1, device=device), torch.zeros(1, device=device) # compute grad norm for two lists - multi_tensor_applier = MultiTensorApply(256*32) - multi_tensor_applier_l2norm = MultiTensorApply(2048*32) if len(g_all_32) > 0: g_norm_32 = multi_tensor_applier_l2norm(self.multi_tensor_l2norm, self._dummy_overflow_buf, diff --git a/apex/optimizers/fused_lars.py b/apex/optimizers/fused_lars.py index de46f9203..3e60b2cce 100644 --- a/apex/optimizers/fused_lars.py +++ b/apex/optimizers/fused_lars.py @@ -2,7 +2,7 @@ from torch.optim.optimizer import Optimizer, required from torch import nn from torch.nn.parameter import Parameter -from apex.multi_tensor_apply import MultiTensorApply +from apex.multi_tensor_apply import multi_tensor_applier class FusedLARS(Optimizer): def __init__(self, params, lr=required, momentum=0, dampening=0, @@ -31,10 +31,8 @@ def __init__(self, params, lr=required, momentum=0, dampening=0, self.trust_coefficient = trust_coefficient self.eps = eps - multi_tensor_applier = MultiTensorApply(256*32) if multi_tensor_applier.available: - from apex.op_builder import AmpCBuilder - amp_C = AmpCBuilder().load() + import amp_C # Skip buffer self._dummy_overflow_buf = torch.tensor([0], dtype=torch.int, device=self.param_groups[0]["params"][0].device) self.multi_tensor_l2norm = amp_C.multi_tensor_l2norm @@ -84,7 +82,6 @@ def step(self, closure=None): closure (callable, optional): A closure that reevaluates the model and returns the loss. 
""" - multi_tensor_applier = MultiTensorApply(256*32) loss = None if closure is not None: loss = closure() diff --git a/apex/optimizers/fused_mixed_precision_lamb.py b/apex/optimizers/fused_mixed_precision_lamb.py index a7642b1d9..7ecda4f51 100644 --- a/apex/optimizers/fused_mixed_precision_lamb.py +++ b/apex/optimizers/fused_mixed_precision_lamb.py @@ -3,7 +3,7 @@ from itertools import chain from collections import defaultdict, abc as container_abcs -from apex.multi_tensor_apply import MultiTensorApply +from apex.multi_tensor_apply import multi_tensor_applier, multi_tensor_applier_l2norm class FusedMixedPrecisionLamb(torch.optim.Optimizer): @@ -31,12 +31,9 @@ def __init__(self, params, lr=1e-3, step=0, bias_correction=True, for idx,group in enumerate(self.param_groups): for item in tensor_state: self.param_groups[idx][item] = group[item].to(device=device) - - multi_tensor_applier = MultiTensorApply(256*32) - multi_tensor_applier_l2norm = MultiTensorApply(2048*32) + if multi_tensor_applier.available and multi_tensor_applier_l2norm.available: - from apex.op_builder import AmpCBuilder - amp_C = AmpCBuilder().load() + import amp_C self.multi_tensor_l2norm=amp_C.multi_tensor_l2norm_mp # Skip buffer self._dummy_overflow_buf = torch.tensor([0], dtype=torch.int, device=device) @@ -183,8 +180,6 @@ def step(self, closure=None, grad_scaler=None): # grad_norm is of scaled gradients. # So, multiply `max_grad_norm` by scale. max_grad_norm = self.defaults['max_grad_norm'] * scale - multi_tensor_applier = MultiTensorApply(256*32) - multi_tensor_applier_l2norm = MultiTensorApply(2048*32) grad_norm = multi_tensor_applier_l2norm( self.multi_tensor_l2norm, self._dummy_overflow_buf, diff --git a/apex/optimizers/fused_novograd.py b/apex/optimizers/fused_novograd.py index d0ff7b24c..b3ec5acb9 100644 --- a/apex/optimizers/fused_novograd.py +++ b/apex/optimizers/fused_novograd.py @@ -1,5 +1,5 @@ import torch -from apex.multi_tensor_apply import MultiTensorApply +from apex.multi_tensor_apply import multi_tensor_applier class FusedNovoGrad(torch.optim.Optimizer): @@ -76,10 +76,8 @@ def __init__(self, params, lr=1e-3, bias_correction=True, grad_averaging=grad_averaging, norm_type=norm_type, init_zero=init_zero) super(FusedNovoGrad, self).__init__(params, defaults) - multi_tensor_applier = MultiTensorApply(256*32) if multi_tensor_applier.available: - from apex.op_builder import AmpCBuilder - amp_C = AmpCBuilder().load() + import amp_C # Skip buffer # Creating the overflow buffer on the same device as the params tensors. @@ -114,7 +112,6 @@ def step(self, closure=None): closure (callable, optional): A closure that reevaluates the model and returns the loss. """ - multi_tensor_applier = MultiTensorApply(256*32) loss = None if closure is not None: loss = closure() diff --git a/apex/optimizers/fused_sgd.py b/apex/optimizers/fused_sgd.py index d8eb409e6..88f26f27a 100644 --- a/apex/optimizers/fused_sgd.py +++ b/apex/optimizers/fused_sgd.py @@ -1,7 +1,7 @@ import torch from torch.optim.optimizer import Optimizer, required -from apex.multi_tensor_apply import MultiTensorApply +from apex.multi_tensor_apply import multi_tensor_applier class FusedSGD(Optimizer): r"""Implements stochastic gradient descent (optionally with momentum). 
@@ -97,10 +97,8 @@ def __init__(self, params, lr=required, momentum=0, dampening=0, self.scale_set_by_backward = False self.set_grad_none = set_grad_none - multi_tensor_applier = MultiTensorApply(256*32) if multi_tensor_applier.available: - from apex.op_builder import AmpCBuilder - amp_C = AmpCBuilder().load() + import amp_C # Skip buffer self._dummy_overflow_buf = torch.tensor([0], dtype=torch.int, device=self.param_groups[0]["params"][0].device) self.multi_tensor_sgd = amp_C.multi_tensor_sgd @@ -144,7 +142,6 @@ def step(self, closure=None): closure (callable, optional): A closure that reevaluates the model and returns the loss. """ - multi_tensor_applier = MultiTensorApply(256*32) loss = None if closure is not None: loss = closure() diff --git a/apex/parallel/__init__.py b/apex/parallel/__init__.py index a477c12a7..3cd7ae56e 100644 --- a/apex/parallel/__init__.py +++ b/apex/parallel/__init__.py @@ -12,6 +12,7 @@ # for both the cuda-enabled and python-fallback versions, and I don't want # to suppress the error information. try: + import syncbn from .optimized_sync_batchnorm import SyncBatchNorm except ImportError as err: from .sync_batchnorm import SyncBatchNorm diff --git a/apex/parallel/distributed.py b/apex/parallel/distributed.py index 15afe6fc3..6aa6a6e8a 100644 --- a/apex/parallel/distributed.py +++ b/apex/parallel/distributed.py @@ -6,15 +6,14 @@ from itertools import chain import copy import importlib -from apex.multi_tensor_apply import MultiTensorApply +from ..multi_tensor_apply import multi_tensor_applier imported_flatten_impl = False def import_flatten_impl(): global flatten_impl, unflatten_impl, imported_flatten_impl try: - from apex.op_builder import ApexCBuilder - apex_C = ApexCBuilder().load() + import apex_C flatten_impl = apex_C.flatten unflatten_impl = apex_C.unflatten except ImportError: @@ -244,11 +243,9 @@ def __init__(self, "torch.cuda.DoubleTensor" : 2, "torch.cuda.BFloat16Tensor" : 3} - multi_tensor_applier = MultiTensorApply(256*32) if multi_tensor_applier.available: # TODO: I really need to centralize the C++ backed imports - from apex.op_builder import AmpCBuilder - amp_C = AmpCBuilder().load() + import amp_C self.multi_tensor_scale = amp_C.multi_tensor_scale self._overflow_buf = torch.cuda.IntTensor([0]) @@ -428,7 +425,6 @@ def _event_this_bucket(self, bucket_idx): def allreduce_bucket(self, bucket, bucket_idx, force_default_stream): tensor = flatten(bucket) - multi_tensor_applier = MultiTensorApply(256*32) if force_default_stream: bucket_stream = self.main_stream diff --git a/apex/parallel/optimized_sync_batchnorm.py b/apex/parallel/optimized_sync_batchnorm.py index 02828578f..65cf5eabf 100644 --- a/apex/parallel/optimized_sync_batchnorm.py +++ b/apex/parallel/optimized_sync_batchnorm.py @@ -1,8 +1,11 @@ import torch from torch.nn.modules.batchnorm import _BatchNorm from torch.nn import functional as F + +import syncbn from .optimized_sync_batchnorm_kernel import SyncBatchnormFunction + class SyncBatchNorm(_BatchNorm): """ synchronized batch normalization module extented from `torch.nn.BatchNormNd` diff --git a/apex/parallel/optimized_sync_batchnorm_kernel.py b/apex/parallel/optimized_sync_batchnorm_kernel.py index ad0e3ba30..616847149 100644 --- a/apex/parallel/optimized_sync_batchnorm_kernel.py +++ b/apex/parallel/optimized_sync_batchnorm_kernel.py @@ -1,16 +1,13 @@ import torch from torch.autograd.function import Function -from apex.op_builder import SyncBnBuilder +import syncbn from apex.parallel import ReduceOp class SyncBatchnormFunction(Function): 
@staticmethod def forward(ctx, input, z, weight, bias, running_mean, running_variance, eps, track_running_stats = True, momentum = 1.0, process_group = None, channel_last = False, fuse_relu = False): - - syncbn = SyncBnBuilder().load() - input = input.contiguous() world_size = 0 @@ -76,8 +73,6 @@ def forward(ctx, input, z, weight, bias, running_mean, running_variance, eps, tr @staticmethod def backward(ctx, grad_output): - syncbn = SyncBnBuilder().load() - grad_output = grad_output.contiguous() # mini batch mean & var are calculated by forward path. # mu = 1./N*np.sum(h, axis = 0) diff --git a/apex/transformer/functional/fused_rope.py b/apex/transformer/functional/fused_rope.py index 7052d91e5..e74906151 100644 --- a/apex/transformer/functional/fused_rope.py +++ b/apex/transformer/functional/fused_rope.py @@ -50,7 +50,7 @@ def check_if_rocm_pytorch(): except ImportError: AITER_ROPE_BACKEND = False if not AITER_ROPE_BACKEND: - from apex.op_builder import FusedRopeBuilder + import fused_rotary_positional_embedding warnings.warn("Using the native apex kernel for RoPE.", UserWarning) @@ -86,7 +86,6 @@ def forward( freqs: torch.Tensor, transpose_output_memory: bool = False, ) -> torch.Tensor: - fused_rotary_positional_embedding = FusedRopeBuilder().load() output = fused_rotary_positional_embedding.forward( t, freqs, transpose_output_memory ) @@ -98,7 +97,6 @@ def forward( def backward( ctx, grad_output: torch.Tensor ) -> Tuple[Union[torch.Tensor, None], ...]: - fused_rotary_positional_embedding = FusedRopeBuilder().load() (freqs,) = ctx.saved_tensors grad_input = fused_rotary_positional_embedding.backward( grad_output, freqs, ctx.transpose_output_memory @@ -213,7 +211,6 @@ def forward( sin_: torch.Tensor, transpose_output_memory: bool = False, ) -> torch.Tensor: - fused_rotary_positional_embedding = FusedRopeBuilder().load() output = fused_rotary_positional_embedding.forward_cached( t, cos_, sin_, transpose_output_memory ) @@ -226,7 +223,6 @@ def forward( def backward( ctx, grad_output: torch.Tensor ) -> Tuple[Union[torch.Tensor, None], ...]: - fused_rotary_positional_embedding = FusedRopeBuilder().load() cos_, sin_ = ctx.saved_tensors grad_input = fused_rotary_positional_embedding.backward_cached( grad_output, cos_, sin_, ctx.transpose_output_memory @@ -339,7 +335,6 @@ def forward( cu_seqlens: torch.Tensor, freqs: torch.Tensor, ) -> torch.Tensor: - fused_rotary_positional_embedding = FusedRopeBuilder().load() output = fused_rotary_positional_embedding.forward_thd( t, cu_seqlens, freqs ) @@ -350,7 +345,6 @@ def forward( def backward( ctx, grad_output: torch.Tensor ) -> Tuple[Union[torch.Tensor, None], ...]: - fused_rotary_positional_embedding = FusedRopeBuilder().load() cu_seqlens, freqs = ctx.saved_tensors grad_input = fused_rotary_positional_embedding.backward_thd( grad_output, cu_seqlens, freqs @@ -454,7 +448,6 @@ def forward( cos_w: torch.Tensor, sin_w: torch.Tensor, ) -> torch.Tensor: - fused_rotary_positional_embedding = FusedRopeBuilder().load() t = t.view(t.shape[0], img_h, img_w, t.shape[2], t.shape[3]) output = fused_rotary_positional_embedding.forward_2d( t, cos_h, sin_h, cos_w, sin_w @@ -468,8 +461,9 @@ def forward( def backward( ctx, grad_output: torch.Tensor ) -> Tuple[Union[torch.Tensor, None], ...]: - fused_rotary_positional_embedding = FusedRopeBuilder().load() + cos_h, sin_h, cos_w, sin_w = ctx.saved_tensors + grad_output = grad_output.view( grad_output.shape[0], ctx.img_h, diff --git a/apex/transformer/functional/fused_softmax.py b/apex/transformer/functional/fused_softmax.py 
index 54830d50a..83243ef7b 100644 --- a/apex/transformer/functional/fused_softmax.py +++ b/apex/transformer/functional/fused_softmax.py @@ -16,7 +16,6 @@ from apex._autocast_utils import _cast_if_autocast_enabled from apex.transformer.enums import AttnMaskType -from apex.op_builder import ScaledSoftmaxCudaBuilder, ScaledUpperTriangMaskedSoftmaxCudaBuilder, GenericScaledMaskedSoftmaxCudaBuilder, ScaledMaskedSoftmaxCudaBuilder class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function): @@ -29,8 +28,7 @@ class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function): @staticmethod def forward(ctx, inputs, scale): - - scaled_upper_triang_masked_softmax_cuda = ScaledUpperTriangMaskedSoftmaxCudaBuilder().load() + import scaled_upper_triang_masked_softmax_cuda scale_t = torch.tensor([scale]) softmax_results = scaled_upper_triang_masked_softmax_cuda.forward( @@ -42,8 +40,7 @@ def forward(ctx, inputs, scale): @staticmethod def backward(ctx, output_grads): - - scaled_upper_triang_masked_softmax_cuda = ScaledUpperTriangMaskedSoftmaxCudaBuilder().load() + import scaled_upper_triang_masked_softmax_cuda softmax_results, scale_t = ctx.saved_tensors input_grads = scaled_upper_triang_masked_softmax_cuda.backward( @@ -74,7 +71,7 @@ def scaled_upper_triang_masked_softmax(inputs, _, scale): class ScaledMaskedSoftmax(torch.autograd.Function): @staticmethod def forward(ctx, inputs, mask, scale): - scaled_masked_softmax_cuda = ScaledMaskedSoftmaxCudaBuilder().load() + import scaled_masked_softmax_cuda scale_t = torch.tensor([scale]) @@ -84,7 +81,7 @@ def forward(ctx, inputs, mask, scale): @staticmethod def backward(ctx, output_grads): - scaled_masked_softmax_cuda = ScaledMaskedSoftmaxCudaBuilder().load() + import scaled_masked_softmax_cuda softmax_results, scale_t = ctx.saved_tensors @@ -109,8 +106,7 @@ def scaled_masked_softmax(inputs, mask, scale): class GenericScaledMaskedSoftmax(torch.autograd.Function): @staticmethod def forward(ctx, inputs, mask, scale): - - generic_scaled_masked_softmax_cuda = GenericScaledMaskedSoftmaxCudaBuilder().load() + import generic_scaled_masked_softmax_cuda scale_t = torch.tensor([scale]) softmax_results = generic_scaled_masked_softmax_cuda.forward(inputs, mask, scale_t[0]) @@ -119,8 +115,7 @@ def forward(ctx, inputs, mask, scale): @staticmethod def backward(ctx, output_grads): - - generic_scaled_masked_softmax_cuda = GenericScaledMaskedSoftmaxCudaBuilder().load() + import generic_scaled_masked_softmax_cuda_new softmax_results, scale_t = ctx.saved_tensors @@ -144,7 +139,7 @@ class ScaledSoftmax(torch.autograd.Function): @staticmethod def forward(ctx, inputs, scale): - scaled_softmax_cuda = ScaledSoftmaxCudaBuilder().load() + import scaled_softmax_cuda scale_t = torch.tensor([scale]) @@ -156,8 +151,7 @@ def forward(ctx, inputs, scale): @staticmethod def backward(ctx, output_grads): - - scaled_softmax_cuda = ScaledSoftmaxCudaBuilder().load() + import scaled_softmax_cuda softmax_results, scale_t = ctx.saved_tensors @@ -275,7 +269,6 @@ def forward_torch_softmax(self, input, mask): @staticmethod def get_batch_per_block(sq, sk, b, np): - - scaled_masked_softmax_cuda = ScaledMaskedSoftmaxCudaBuilder().load() + import scaled_masked_softmax_cuda return scaled_masked_softmax_cuda.get_batch_per_block(sq, sk, b, np) diff --git a/apex/transformer/pipeline_parallel/utils.py b/apex/transformer/pipeline_parallel/utils.py index e225336f9..ae550d0b9 100644 --- a/apex/transformer/pipeline_parallel/utils.py +++ b/apex/transformer/pipeline_parallel/utils.py @@ -19,11 +19,13 @@ import torch from 
torch.nn.parallel import DistributedDataParallel -from apex.multi_tensor_apply import MultiTensorApply +from apex.multi_tensor_apply import multi_tensor_applier from apex.transformer import parallel_state from apex.transformer.enums import ModelType from apex.transformer.microbatches import build_num_microbatches_calculator from apex.transformer.pipeline_parallel._timers import _Timers +if multi_tensor_applier.available: + import amp_C _GLOBAL_ARGS = None @@ -225,10 +227,6 @@ def calc_params_l2_norm(model: torch.nn.Module, bf16: bool): else: params_data.append(param.data) # Calculate norm - multi_tensor_applier = MultiTensorApply(256*32) - if multi_tensor_applier.available: - from apex.op_builder import AmpCBuilder - amp_C = AmpCBuilder().load() dummy_overflow_buf = torch.cuda.IntTensor([0]) norm, _ = multi_tensor_applier( amp_C.multi_tensor_l2norm, dummy_overflow_buf, [params_data], False # no per-parameter norm diff --git a/apex/transformer/tensor_parallel/layers.py b/apex/transformer/tensor_parallel/layers.py index 0644f58f7..346dfaa7a 100644 --- a/apex/transformer/tensor_parallel/layers.py +++ b/apex/transformer/tensor_parallel/layers.py @@ -54,7 +54,7 @@ _grad_accum_fusion_available = True try: - from apex.op_builder import FusedWeightGradientMlpCudaBuilder + import fused_weight_gradient_mlp_cuda except ImportError: _grad_accum_fusion_available = False @@ -363,7 +363,6 @@ def backward(ctx, grad_output): ) if ctx.gradient_accumulation_fusion: - fused_weight_gradient_mlp_cuda = FusedWeightGradientMlpCudaBuilder().load() if not ctx.use_16bit_in_wgrad_accum_fusion: fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp32( total_input, grad_output, weight.main_grad diff --git a/examples/imagenet/main_amp.py b/examples/imagenet/main_amp.py index a2de55a06..c4b0fdfd5 100644 --- a/examples/imagenet/main_amp.py +++ b/examples/imagenet/main_amp.py @@ -21,6 +21,7 @@ from apex.parallel import DistributedDataParallel as DDP from apex.fp16_utils import * from apex import amp, optimizers + from apex.multi_tensor_apply import multi_tensor_applier except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to run this example.") diff --git a/tests/L0/log_test.txt b/tests/L0/log_test.txt deleted file mode 100644 index 2bdda3252..000000000 --- a/tests/L0/log_test.txt +++ /dev/null @@ -1,4959 +0,0 @@ -test_add_param_group (test_add_param_group.TestAddParamGroup) ... ok -test_bce_is_float_with_allow_banned (test_basic_casts.TestBannedMethods) ... ok -test_bce_raises_by_default (test_basic_casts.TestBannedMethods) ... ok -test_batch_norm_is_match (test_basic_casts.TestBasicCastsBFloat16) ... ok -test_conv2d_is_bfloat16 (test_basic_casts.TestBasicCastsBFloat16) ... skipped "test doesn't currently work on ROCm stack." -test_group_norm_is_float (test_basic_casts.TestBasicCastsBFloat16) ... ok -test_linear_is_bfloat16 (test_basic_casts.TestBasicCastsBFloat16) ... skipped "test doesn't currently work on ROCm stack." -test_mse_loss_is_float (test_basic_casts.TestBasicCastsBFloat16) ... ok -test_relu_is_match (test_basic_casts.TestBasicCastsBFloat16) ... ok -test_softmax_is_float (test_basic_casts.TestBasicCastsBFloat16) ... ok -test_batch_norm_is_match (test_basic_casts.TestBasicCastsHalf) ... ok -test_conv2d_is_half (test_basic_casts.TestBasicCastsHalf) ... ok -test_group_norm_is_float (test_basic_casts.TestBasicCastsHalf) ... ok -test_linear_is_half (test_basic_casts.TestBasicCastsHalf) ... ok -test_mse_loss_is_float (test_basic_casts.TestBasicCastsHalf) ... 
ok -test_relu_is_match (test_basic_casts.TestBasicCastsHalf) ... ok -test_softmax_is_float (test_basic_casts.TestBasicCastsHalf) ... ok -test_cpu_is_float (test_basic_casts.TestTensorCastsBFloat16) ... ok -test_matmul_method_is_bfloat16 (test_basic_casts.TestTensorCastsBFloat16) ... skipped "test doesn't currently work on ROCm stack." -test_matmul_op_is_bfloat16 (test_basic_casts.TestTensorCastsBFloat16) ... skipped "test doesn't currently work on ROCm stack." -test_pow_method_is_float (test_basic_casts.TestTensorCastsBFloat16) ... ok -test_pow_op_is_float (test_basic_casts.TestTensorCastsBFloat16) ... ok -test_sum_is_float (test_basic_casts.TestTensorCastsBFloat16) ... ok -test_cpu_is_float (test_basic_casts.TestTensorCastsHalf) ... ok -test_matmul_method_is_half (test_basic_casts.TestTensorCastsHalf) ... ok -test_matmul_op_is_half (test_basic_casts.TestTensorCastsHalf) ... ok -test_pow_method_is_float (test_basic_casts.TestTensorCastsHalf) ... ok -test_pow_op_is_float (test_basic_casts.TestTensorCastsHalf) ... ok -test_sum_is_float (test_basic_casts.TestTensorCastsHalf) ... ok -test_blacklist_module_bfp16_weight (test_cache.TestCache) ... ok -test_blacklist_module_fp16_weight (test_cache.TestCache) ... ok -test_blacklist_module_fp32_weight (test_cache.TestCache) ... ok -test_promote_module_bfp16_weight (test_cache.TestCache) ... ok -test_promote_module_fp16_weight (test_cache.TestCache) ... ok -test_promote_module_fp32_weight (test_cache.TestCache) ... ok -test_whitelist_module_bfp16_weight (test_cache.TestCache) ... ok -test_whitelist_module_fp16_weight (test_cache.TestCache) ... ok -test_whitelist_module_fp32_weight (test_cache.TestCache) ... ok -test_loss_scale_decrease (test_checkpointing.TestCheckpointing) ... skipped 'Test is flaky.' -test_restoring (test_checkpointing.TestCheckpointing) ... ok -test_state_dict (test_checkpointing.TestCheckpointing) ... /skishore/github/pytorch/torch/utils/_device.py:100: UserWarning: Converting a tensor with requires_grad=True to a scalar may lead to unexpected behavior. -Consider using tensor.detach() first. (Triggered internally at /skishore/github/pytorch/aten/src/ATen/native/Scalar.cpp:22.) - return func(*args, **kwargs) -ok -test_2models2losses1optimizer (test_fused_sgd.TestMultipleModelsOptimizersLosses) ... ok -test_2models2losses2optimizers (test_fused_sgd.TestMultipleModelsOptimizersLosses) ... ok -test_3models2losses1optimizer (test_fused_sgd.TestMultipleModelsOptimizersLosses) ... ok -test_3models2losses2optimizers (test_fused_sgd.TestMultipleModelsOptimizersLosses) ... ok -test_larc_mixed_precision (test_larc.TestLARC) ... ok -test_fuzz (test_multi_tensor_axpby.TestMultiTensorAxpby) ... ok -test_fuzz_nhwc (test_multi_tensor_axpby.TestMultiTensorAxpby) ... ok -test_fuzz (test_multi_tensor_l2norm.TestMultiTensorL2Norm) ... /skishore/github/apex/tests/L0/run_amp/test_multi_tensor_l2norm.py:37: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at /skishore/github/pytorch/torch/csrc/tensor/python_tensor.cpp:78.) - a = torch.cuda.FloatTensor(sizea).fill_(self.val) -ok -test_fuzz (test_multi_tensor_scale.TestMultiTensorScale) ... ok -test_2models2losses1optimizer (test_multiple_models_optimizers_losses.TestMultipleModelsOptimizersLosses) ... ok -test_2models2losses2optimizers (test_multiple_models_optimizers_losses.TestMultipleModelsOptimizersLosses) ... 
ok -test_3models2losses1optimizer (test_multiple_models_optimizers_losses.TestMultipleModelsOptimizersLosses) ... ok -test_3models2losses2optimizers (test_multiple_models_optimizers_losses.TestMultipleModelsOptimizersLosses) ... ok -test_cat_matches_widest (test_promotion.TestPromotionBFloat16) ... ok -test_inplace_add_matches_self (test_promotion.TestPromotionBFloat16) ... ok -test_inplace_exp_is_error_for_bfloat16 (test_promotion.TestPromotionBFloat16) ... ok -test_mul_matches_widest (test_promotion.TestPromotionBFloat16) ... ok -test_atan2_matches_widest (test_promotion.TestPromotionHalf) ... ok -test_cat_matches_widest (test_promotion.TestPromotionHalf) ... ok -test_inplace_add_matches_self (test_promotion.TestPromotionHalf) ... ok -test_inplace_exp_is_error_for_half (test_promotion.TestPromotionHalf) ... ok -test_mul_matches_widest (test_promotion.TestPromotionHalf) ... ok -test_gru_cell_is_half (test_rnn.TestRnnCells) ... ok -test_lstm_cell_is_half (test_rnn.TestRnnCells) ... ok -test_rnn_cell_is_half (test_rnn.TestRnnCells) ... ok -test_gru_is_half (test_rnn.TestRnns) ... skipped "test doesn't currently work on ROCm stack." -test_lstm_is_half (test_rnn.TestRnns) ... skipped "test doesn't currently work on ROCm stack." -test_rnn_is_half (test_rnn.TestRnns) ... skipped "test doesn't currently work on ROCm stack." -test_rnn_packed_sequence (test_rnn.TestRnns) ... skipped "test doesn't currently work on ROCm stack." - ----------------------------------------------------------------------- -Ran 70 tests in 77.770s - -OK (skipped=9) -test_output_is_half (test_fp16util.TestFP16Model) ... ok -test_params_and_buffers (test_fp16util.TestFP16Model) ... ok - ----------------------------------------------------------------------- -Ran 2 tests in 0.577s - -OK -testGradScaler (test_adam.AdamTest) ... ok -testGradScalerCapturable (test_adam.AdamTest) ... /skishore/github/pytorch/torch/amp/grad_scaler.py:423: FutureWarning: GradScaler is going to stop passing itself as a keyword argument to the passed optimizer. In the near future GradScaler registers `grad_scale: Tensor` and `found_inf: Tensor` to the passed optimizer and let the optimizer use them directly. - warnings.warn( -ok -testGradScalerCapturableMaster (test_adam.AdamTest) ... ok -testLargeTensor (test_adam.AdamTest) ... ok -testNative (test_adam.AdamTest) ... ok -test_float (test_fused_novograd.TestFusedNovoGrad) ... ok -test_half (test_fused_novograd.TestFusedNovoGrad) ... ok -test_multi_device (test_fused_novograd.TestFusedNovoGrad) ... ok -test_multi_params (test_fused_novograd.TestFusedNovoGrad) ... ok -test_adagrad_option (test_fused_optimizer.TestFusedAdagrad) ... ok -test_float (test_fused_optimizer.TestFusedAdagrad) ... ok -test_half (test_fused_optimizer.TestFusedAdagrad) ... skipped 'PyTorch optimizer is not numerically correct for fp16' -test_multi_device (test_fused_optimizer.TestFusedAdagrad) ... ok -test_multi_params (test_fused_optimizer.TestFusedAdagrad) ... ok -test_multi_params_different_devices_throws (test_fused_optimizer.TestFusedAdagrad) ... ok -test_adam_option (test_fused_optimizer.TestFusedAdam) ... ok -test_bfloat16 (test_fused_optimizer.TestFusedAdam) ... skipped "test doesn't currently work on ROCm stack." -test_float (test_fused_optimizer.TestFusedAdam) ... ok -test_fp16_output (test_fused_optimizer.TestFusedAdam) ... skipped 'No longer support output fp16 param' -test_half (test_fused_optimizer.TestFusedAdam) ... skipped 'NaN issue observed on ROCm as of 12/1/2021. 
The failing unit test is introduced by a PyTorch commit sometime in between rocm/pytorch:rocm4.3.1_ubuntu18.04_py3.6_pytorch_1.9.0 and 2021/12/01. Please refer to https://github.com/ROCmSoftwarePlatform/apex/issues/63' -test_multi_device (test_fused_optimizer.TestFusedAdam) ... ok -test_multi_params (test_fused_optimizer.TestFusedAdam) ... skipped 'Disable until 8/1/2019 adam/adamw upstream picked' -test_scale (test_fused_optimizer.TestFusedAdam) ... skipped 'No longer support fuse scaling' -test_float (test_fused_optimizer.TestFusedSGD) ... ok -test_half (test_fused_optimizer.TestFusedSGD) ... ok -test_multi_device (test_fused_optimizer.TestFusedSGD) ... ok -test_float (test_fused_optimizer_channels_last.TestFusedSGDChannelLast) ... ok -test_half (test_fused_optimizer_channels_last.TestFusedSGDChannelLast) ... ok -test_multi_device (test_fused_optimizer_channels_last.TestFusedSGDChannelLast) ... ok -test_float (test_lamb.TestFusedLAMB) ... ok -test_half (test_lamb.TestFusedLAMB) ... skipped 'PyTorch optimizer is not numerically correct for fp16' -test_lamb_option (test_lamb.TestFusedLAMB) ... ok -test_multi_device (test_lamb.TestFusedLAMB) ... ok -test_multi_params (test_lamb.TestFusedLAMB) ... ok -test_float (test_lamb.TestFusedMixedPrecisionLamb) ... ok -test_half (test_lamb.TestFusedMixedPrecisionLamb) ... skipped 'PyTorch optimizer is not numerically correct for fp16' -test_lamb_option (test_lamb.TestFusedMixedPrecisionLamb) ... ok -test_multi_device (test_lamb.TestFusedMixedPrecisionLamb) ... skipped 'Skipped the test since it failed the accuracy test on the PyTorch as of 8/1/2022. Please refer to https://github.com/ROCmSoftwarePlatform/apex/issues/83' -test_multi_params (test_lamb.TestFusedMixedPrecisionLamb) ... ok - ----------------------------------------------------------------------- -Ran 39 tests in 15.676s - -OK (skipped=9) -test_autocast_fused_layer_norm_bfloat16_elementwise_affine_False_memory_efficient_False_cuda_bfloat16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... /opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/_autocast_utils.py:26: FutureWarning: `torch.cuda.amp.autocast_mode._cast(value, dtype)` is deprecated. Please use `torch.amp.autocast_mode._cast(value, 'cuda', dtype)` instead. - return torch.cuda.amp.autocast_mode._cast(args, torch.get_autocast_gpu_dtype()) -ok -test_autocast_fused_layer_norm_bfloat16_elementwise_affine_False_memory_efficient_True_cuda_bfloat16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_autocast_fused_layer_norm_bfloat16_elementwise_affine_True_memory_efficient_False_cuda_bfloat16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_autocast_fused_layer_norm_bfloat16_elementwise_affine_True_memory_efficient_True_cuda_bfloat16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_autocast_fused_layer_norm_float16_elementwise_affine_False_memory_efficient_False_cuda_float16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_autocast_fused_layer_norm_float16_elementwise_affine_False_memory_efficient_True_cuda_float16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_autocast_fused_layer_norm_float16_elementwise_affine_True_memory_efficient_False_cuda_float16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_autocast_fused_layer_norm_float16_elementwise_affine_True_memory_efficient_True_cuda_float16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... 
ok -test_autocast_fused_rms_norm_bfloat16_elementwise_affine_False_memory_efficient_False_cuda_bfloat16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_autocast_fused_rms_norm_bfloat16_elementwise_affine_False_memory_efficient_True_cuda_bfloat16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_autocast_fused_rms_norm_bfloat16_elementwise_affine_True_memory_efficient_False_cuda_bfloat16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_autocast_fused_rms_norm_bfloat16_elementwise_affine_True_memory_efficient_True_cuda_bfloat16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_autocast_fused_rms_norm_float16_elementwise_affine_False_memory_efficient_False_cuda_float16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_autocast_fused_rms_norm_float16_elementwise_affine_False_memory_efficient_True_cuda_float16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_autocast_fused_rms_norm_float16_elementwise_affine_True_memory_efficient_False_cuda_float16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_autocast_fused_rms_norm_float16_elementwise_affine_True_memory_efficient_True_cuda_float16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_compile_fused_layer_norm_elementwise_affine_False_cuda (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_compile_fused_layer_norm_elementwise_affine_True_cuda (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_compile_fused_rms_norm_elementwise_affine_False_cuda (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_compile_fused_rms_norm_elementwise_affine_True_cuda (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_layer_norm_bfloat16_batch_size_16_contiguous_False_elementwise_affine_True_mixed_fused_False_bfloat16_memory_efficient_False_cuda_bfloat16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_layer_norm_bfloat16_batch_size_16_contiguous_False_elementwise_affine_True_mixed_fused_False_bfloat16_memory_efficient_True_cuda_bfloat16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_layer_norm_bfloat16_batch_size_16_contiguous_True_elementwise_affine_True_mixed_fused_False_bfloat16_memory_efficient_False_cuda_bfloat16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_layer_norm_bfloat16_batch_size_16_contiguous_True_elementwise_affine_True_mixed_fused_False_bfloat16_memory_efficient_True_cuda_bfloat16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_layer_norm_elemwise_batch_size_16_contiguous_False_elementwise_affine_True_mixed_fused_False_float32_memory_efficient_False_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_layer_norm_elemwise_batch_size_16_contiguous_False_elementwise_affine_True_mixed_fused_False_float32_memory_efficient_True_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_layer_norm_elemwise_batch_size_16_contiguous_True_elementwise_affine_True_mixed_fused_False_float32_memory_efficient_False_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_layer_norm_elemwise_batch_size_16_contiguous_True_elementwise_affine_True_mixed_fused_False_float32_memory_efficient_True_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_layer_norm_elemwise_batch_size_65536_contiguous_False_elementwise_affine_True_mixed_fused_False_float32_memory_efficient_False_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... 
ok -test_layer_norm_elemwise_batch_size_65536_contiguous_False_elementwise_affine_True_mixed_fused_False_float32_memory_efficient_True_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_layer_norm_elemwise_batch_size_65536_contiguous_True_elementwise_affine_True_mixed_fused_False_float32_memory_efficient_False_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_layer_norm_elemwise_batch_size_65536_contiguous_True_elementwise_affine_True_mixed_fused_False_float32_memory_efficient_True_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_layer_norm_export_cuda (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_layer_norm_half_batch_size_16_contiguous_False_elementwise_affine_True_mixed_fused_False_float16_memory_efficient_False_cuda_float16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_layer_norm_half_batch_size_16_contiguous_False_elementwise_affine_True_mixed_fused_False_float16_memory_efficient_True_cuda_float16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_layer_norm_half_batch_size_16_contiguous_True_elementwise_affine_True_mixed_fused_False_float16_memory_efficient_False_cuda_float16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_layer_norm_half_batch_size_16_contiguous_True_elementwise_affine_True_mixed_fused_False_float16_memory_efficient_True_cuda_float16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_layer_norm_mixed_batch_size_16_contiguous_False_elementwise_affine_True_mixed_fused_True_float32_memory_efficient_False_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_layer_norm_mixed_batch_size_16_contiguous_False_elementwise_affine_True_mixed_fused_True_float32_memory_efficient_True_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_layer_norm_mixed_batch_size_16_contiguous_True_elementwise_affine_True_mixed_fused_True_float32_memory_efficient_False_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_layer_norm_mixed_batch_size_16_contiguous_True_elementwise_affine_True_mixed_fused_True_float32_memory_efficient_True_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_layer_norm_mixed_batch_size_65536_contiguous_False_elementwise_affine_True_mixed_fused_True_float32_memory_efficient_False_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_layer_norm_mixed_batch_size_65536_contiguous_False_elementwise_affine_True_mixed_fused_True_float32_memory_efficient_True_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_layer_norm_mixed_batch_size_65536_contiguous_True_elementwise_affine_True_mixed_fused_True_float32_memory_efficient_False_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_layer_norm_mixed_batch_size_65536_contiguous_True_elementwise_affine_True_mixed_fused_True_float32_memory_efficient_True_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_layer_norm_regular_batch_size_16_contiguous_False_elementwise_affine_False_mixed_fused_False_float32_memory_efficient_False_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_layer_norm_regular_batch_size_16_contiguous_False_elementwise_affine_False_mixed_fused_False_float32_memory_efficient_True_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... 
ok -test_layer_norm_regular_batch_size_16_contiguous_True_elementwise_affine_False_mixed_fused_False_float32_memory_efficient_False_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_layer_norm_regular_batch_size_16_contiguous_True_elementwise_affine_False_mixed_fused_False_float32_memory_efficient_True_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_layer_norm_regular_batch_size_65536_contiguous_False_elementwise_affine_False_mixed_fused_False_float32_memory_efficient_False_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_layer_norm_regular_batch_size_65536_contiguous_False_elementwise_affine_False_mixed_fused_False_float32_memory_efficient_True_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_layer_norm_regular_batch_size_65536_contiguous_True_elementwise_affine_False_mixed_fused_False_float32_memory_efficient_False_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_layer_norm_regular_batch_size_65536_contiguous_True_elementwise_affine_False_mixed_fused_False_float32_memory_efficient_True_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_rms_export_cuda (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_rms_norm_bfloat16_batch_size_16_contiguous_False_elementwise_affine_True_mixed_fused_False_bfloat16_memory_efficient_False_cuda_bfloat16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_rms_norm_bfloat16_batch_size_16_contiguous_False_elementwise_affine_True_mixed_fused_False_bfloat16_memory_efficient_True_cuda_bfloat16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_rms_norm_bfloat16_batch_size_16_contiguous_True_elementwise_affine_True_mixed_fused_False_bfloat16_memory_efficient_False_cuda_bfloat16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_rms_norm_bfloat16_batch_size_16_contiguous_True_elementwise_affine_True_mixed_fused_False_bfloat16_memory_efficient_True_cuda_bfloat16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_rms_norm_elemwise_batch_size_16_contiguous_False_elementwise_affine_True_mixed_fused_False_float32_memory_efficient_False_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_rms_norm_elemwise_batch_size_16_contiguous_False_elementwise_affine_True_mixed_fused_False_float32_memory_efficient_True_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_rms_norm_elemwise_batch_size_16_contiguous_True_elementwise_affine_True_mixed_fused_False_float32_memory_efficient_False_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_rms_norm_elemwise_batch_size_16_contiguous_True_elementwise_affine_True_mixed_fused_False_float32_memory_efficient_True_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_rms_norm_elemwise_batch_size_65536_contiguous_False_elementwise_affine_True_mixed_fused_False_float32_memory_efficient_False_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_rms_norm_elemwise_batch_size_65536_contiguous_False_elementwise_affine_True_mixed_fused_False_float32_memory_efficient_True_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_rms_norm_elemwise_batch_size_65536_contiguous_True_elementwise_affine_True_mixed_fused_False_float32_memory_efficient_False_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... 
ok -test_rms_norm_elemwise_batch_size_65536_contiguous_True_elementwise_affine_True_mixed_fused_False_float32_memory_efficient_True_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_rms_norm_half_batch_size_16_contiguous_False_elementwise_affine_True_mixed_fused_False_float16_memory_efficient_False_cuda_float16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_rms_norm_half_batch_size_16_contiguous_False_elementwise_affine_True_mixed_fused_False_float16_memory_efficient_True_cuda_float16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_rms_norm_half_batch_size_16_contiguous_True_elementwise_affine_True_mixed_fused_False_float16_memory_efficient_False_cuda_float16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_rms_norm_half_batch_size_16_contiguous_True_elementwise_affine_True_mixed_fused_False_float16_memory_efficient_True_cuda_float16 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_rms_norm_mixed_batch_size_16_contiguous_False_elementwise_affine_True_mixed_fused_True_float32_memory_efficient_False_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_rms_norm_mixed_batch_size_16_contiguous_False_elementwise_affine_True_mixed_fused_True_float32_memory_efficient_True_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_rms_norm_mixed_batch_size_16_contiguous_True_elementwise_affine_True_mixed_fused_True_float32_memory_efficient_False_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_rms_norm_mixed_batch_size_16_contiguous_True_elementwise_affine_True_mixed_fused_True_float32_memory_efficient_True_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_rms_norm_mixed_batch_size_65536_contiguous_False_elementwise_affine_True_mixed_fused_True_float32_memory_efficient_False_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_rms_norm_mixed_batch_size_65536_contiguous_False_elementwise_affine_True_mixed_fused_True_float32_memory_efficient_True_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_rms_norm_mixed_batch_size_65536_contiguous_True_elementwise_affine_True_mixed_fused_True_float32_memory_efficient_False_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_rms_norm_mixed_batch_size_65536_contiguous_True_elementwise_affine_True_mixed_fused_True_float32_memory_efficient_True_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_rms_norm_regular_batch_size_16_contiguous_False_elementwise_affine_False_mixed_fused_False_float32_memory_efficient_False_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_rms_norm_regular_batch_size_16_contiguous_False_elementwise_affine_False_mixed_fused_False_float32_memory_efficient_True_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_rms_norm_regular_batch_size_16_contiguous_True_elementwise_affine_False_mixed_fused_False_float32_memory_efficient_False_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_rms_norm_regular_batch_size_16_contiguous_True_elementwise_affine_False_mixed_fused_False_float32_memory_efficient_True_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_rms_norm_regular_batch_size_65536_contiguous_False_elementwise_affine_False_mixed_fused_False_float32_memory_efficient_False_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... 
ok -test_rms_norm_regular_batch_size_65536_contiguous_False_elementwise_affine_False_mixed_fused_False_float32_memory_efficient_True_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_rms_norm_regular_batch_size_65536_contiguous_True_elementwise_affine_False_mixed_fused_False_float32_memory_efficient_False_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok -test_rms_norm_regular_batch_size_65536_contiguous_True_elementwise_affine_False_mixed_fused_False_float32_memory_efficient_True_cuda_float32 (test_fused_layer_norm.TestFusedLayerNormCUDA) ... ok - ----------------------------------------------------------------------- -Ran 86 tests in 108.763s - -OK -test_creation (test_mlp.TestMLP) ... ok -test_no_bias (test_mlp.TestMLP) ... skipped 'Test is flaky.' -test_no_grad (test_mlp.TestMLP) ... skipped 'Test is flaky.' -test_numeric (test_mlp.TestMLP) ... skipped 'Test is flaky.' -test_performance_half (test_mlp.TestMLP) ... ok -test_with_bias (test_mlp.TestMLP) ... skipped 'Test is flaky.' - ----------------------------------------------------------------------- -Ran 6 tests in 1.272s - -OK (skipped=4) -test_fused_dense (test_fused_dense.FusedDenseTest) ... ok -test_fused_dense_gelu_dense (test_gelu.FusedDenseGeluDenseTest) ... ok - ----------------------------------------------------------------------- -Ran 2 tests in 0.052s - -OK -test_batch_sampler_behavior (test_batch_sampler.TestBatchSamplerBehavior) ... /skishore/github/apex/tests/L0/run_amp - -Executing tests from /skishore/github/apex/tests/L0/run_amp -Warning: unscaling grads that are not FP32. Unscaling non-fp32 grads may indicate an error. When using Amp, you don't need to call .half() on your model. -/skishore/github/apex/tests/L0/run_fp16util - -Executing tests from /skishore/github/apex/tests/L0/run_fp16util -/skishore/github/apex/tests/L0/run_optimizers - -Executing tests from /skishore/github/apex/tests/L0/run_optimizers -/skishore/github/apex/tests/L0/run_fused_layer_norm - -Executing tests from /skishore/github/apex/tests/L0/run_fused_layer_norm -/skishore/github/apex/tests/L0/run_mlp - -Executing tests from /skishore/github/apex/tests/L0/run_mlp - -Pytorch MLP time 1.4724 ms -C++ MLP time 0.7349 ms -/skishore/github/apex/tests/L0/run_fused_dense - -Executing tests from /skishore/github/apex/tests/L0/run_fused_dense -/skishore/github/apex/tests/L0/run_transformer - -Executing tests from /skishore/github/apex/tests/L0/run_transformer -ok -test_split_batch (test_batch_sampler.TestBatchSamplerBehavior) ... ok -test_cross_entropy (test_cross_entropy.NcclVocabParallelCrossEntropyTest) ... [rank3]:[W603 15:57:52.202502613 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank2]:[W603 15:57:52.204052493 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank1]:[W603 15:57:52.206180331 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. 
You can specify device_id in init_process_group() to force use of a particular device. -[rank0]:[W603 15:57:52.628771833 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -/skishore/github/apex/tests/L0/run_transformer/test_cross_entropy.py:28: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at /skishore/github/pytorch/torch/csrc/tensor/python_tensor.cpp:78.) - target = torch.cuda.LongTensor(size=(batch_size, seq_length)).random_(0, vocab_size) -/skishore/github/apex/tests/L0/run_transformer/test_cross_entropy.py:28: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at /skishore/github/pytorch/torch/csrc/tensor/python_tensor.cpp:78.) - target = torch.cuda.LongTensor(size=(batch_size, seq_length)).random_(0, vocab_size) -/skishore/github/apex/tests/L0/run_transformer/test_cross_entropy.py:28: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at /skishore/github/pytorch/torch/csrc/tensor/python_tensor.cpp:78.) - target = torch.cuda.LongTensor(size=(batch_size, seq_length)).random_(0, vocab_size) -/skishore/github/apex/tests/L0/run_transformer/test_cross_entropy.py:28: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at /skishore/github/pytorch/torch/csrc/tensor/python_tensor.cpp:78.) - target = torch.cuda.LongTensor(size=(batch_size, seq_length)).random_(0, vocab_size) -[dist init] rank = 0, world_size = 4 -[dist init] rank = 3, world_size = 4 -[dist init] rank = 2, world_size = 4 -[dist init] rank = 1, world_size = 4 -ok -test_cross_entropy (test_cross_entropy.UccVocabParallelCrossEntropyTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' -test_broadcast_data (test_data.NcclBroadcastDataTest) ... [rank3]:[W603 15:58:13.144932380 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank2]:[W603 15:58:13.146511574 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank1]:[W603 15:58:13.150470229 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. 
-[rank0]:[W603 15:58:13.156459458 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -25-06-03 15:58:15 - PID:18274 - rank:(0, 0, 0, 0) - parallel_state.py:145 - INFO - > initializing tensor model parallel with size 4 -25-06-03 15:58:15 - PID:18274 - rank:(0, 0, 0, 0) - parallel_state.py:150 - INFO - > initializing pipeline model parallel with size 1 -25-06-03 15:58:15 - PID:18274 - rank:(0, 0, 0, 0) - parallel_state.py:155 - INFO - > initializing data parallel with size 1 -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/data.py:50: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at /skishore/github/pytorch/torch/csrc/tensor/python_tensor.cpp:78.) - sizes_cuda = torch.cuda.LongTensor(sizes) -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/data.py:50: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at /skishore/github/pytorch/torch/csrc/tensor/python_tensor.cpp:78.) - sizes_cuda = torch.cuda.LongTensor(sizes) -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/data.py:50: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at /skishore/github/pytorch/torch/csrc/tensor/python_tensor.cpp:78.) - sizes_cuda = torch.cuda.LongTensor(sizes) -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/data.py:50: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at /skishore/github/pytorch/torch/csrc/tensor/python_tensor.cpp:78.) - sizes_cuda = torch.cuda.LongTensor(sizes) -[dist init] rank = 0, world_size = 4 -[dist init] rank = 3, world_size = 4 -[dist init] rank = 2, world_size = 4 -[dist init] rank = 1, world_size = 4 -ok -test_broadcast_data (test_data.UccBroadcastDataTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' -test_fused_bias_swiglu (test_fused_bias_swiglu.TestFusedBiasSwiGLU) ... ok -test_2d_forward_backward (test_fused_rope.TestFusedRoPE) ... ok -test_forward_backward (test_fused_rope.TestFusedRoPE) ... ok -test_thd_forward_backward (test_fused_rope.TestFusedRoPE) ... ok -test_autocast_fused_scale_mask_softmax (test_fused_softmax.TestFusedScaleMaskSoftmax) ... /skishore/github/apex/tests/L0/run_transformer/test_fused_softmax.py:119: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast(dtype=dtype): -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/_autocast_utils.py:26: FutureWarning: `torch.cuda.amp.autocast_mode._cast(value, dtype)` is deprecated. Please use `torch.amp.autocast_mode._cast(value, 'cuda', dtype)` instead. 
- return torch.cuda.amp.autocast_mode._cast(args, torch.get_autocast_gpu_dtype()) -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/functional/fused_softmax.py:98: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast(enabled=False): -ok -test_autocast_fused_upper_triangle_mask_softmax (test_fused_softmax.TestFusedScaleMaskSoftmax) ... /skishore/github/apex/tests/L0/run_transformer/test_fused_softmax.py:207: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast(dtype=dtype): -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/functional/fused_softmax.py:59: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast(enabled=False): -ok -test_fused_scale_mask_softmax (test_fused_softmax.TestFusedScaleMaskSoftmax) -attention_scores.shape = [4, 12, 24, 24] ... ok -test_fused_upper_triangle_mask_softmax (test_fused_softmax.TestFusedScaleMaskSoftmax) -attn_weights.shape: [4, 12, 24, 24] ... ok -test_affine_weight_init_column_parallel_cpu (test_layers.NcclTensorParallelLayerTest) ... Testing with data type: torch.float32 -Test succeeded for data type: torch.float32 -Testing with data type: torch.float64 -Test succeeded for data type: torch.float64 -Testing with data type: torch.float16 -Test succeeded for data type: torch.float16 -[rank1]:[W603 15:58:37.066884666 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank2]:[W603 15:58:37.235722975 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank0]:[W603 15:58:37.239866274 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank3]:[W603 15:58:37.245574686 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[dist init] rank = 0, world_size = 4 -[dist init] rank = 3, world_size = 4 -[dist init] rank = 1, world_size = 4 -[dist init] rank = 2, world_size = 4 -ok -test_affine_weight_init_column_parallel_gpu (test_layers.NcclTensorParallelLayerTest) ... [rank2]:[W603 15:58:57.652765635 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. 
-[rank0]:[W603 15:58:58.377812841 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank3]:[W603 15:58:58.434011858 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank1]:[W603 15:58:58.436293735 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[dist init] rank = 0, world_size = 4 -[dist init] rank = 1, world_size = 4 -[dist init] rank = 3, world_size = 4 -[dist init] rank = 2, world_size = 4 -ok -test_affine_weight_init_row_parallel_cpu (test_layers.NcclTensorParallelLayerTest) ... [rank0]:[W603 15:59:14.275444278 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank2]:[W603 15:59:14.276935120 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank3]:[W603 15:59:14.672555975 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank1]:[W603 15:59:14.674465709 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[dist init] rank = 0, world_size = 4 -[dist init] rank = 3, world_size = 4 -[dist init] rank = 2, world_size = 4 -[dist init] rank = 1, world_size = 4 -ok -test_affine_weight_init_row_parallel_gpu (test_layers.NcclTensorParallelLayerTest) ... [rank1]:[W603 15:59:29.087710480 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank0]:[W603 15:59:30.057818801 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank2]:[W603 15:59:30.066972767 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. 
This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank3]:[W603 15:59:30.168463263 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[dist init] rank = 0, world_size = 4 -[dist init] rank = 3, world_size = 4 -[dist init] rank = 1, world_size = 4 -[dist init] rank = 2, world_size = 4 -ok -test_all_gather_parity (test_layers.NcclTensorParallelLayerTest) ... [rank0]:[W603 15:59:43.936977030 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank1]:[W603 15:59:44.440270376 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank3]:[W603 15:59:44.583908053 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank2]:[W603 15:59:44.586542463 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -/skishore/github/pytorch/torch/distributed/c10d_logger.py:81: FutureWarning: `torch.distributed._all_gather_base` is a private function and will be deprecated. Please use `torch.distributed.all_gather_into_tensor` instead. - return func(*args, **kwargs) -/skishore/github/pytorch/torch/distributed/c10d_logger.py:81: FutureWarning: `torch.distributed._all_gather_base` is a private function and will be deprecated. Please use `torch.distributed.all_gather_into_tensor` instead. - return func(*args, **kwargs) -/skishore/github/pytorch/torch/distributed/c10d_logger.py:81: FutureWarning: `torch.distributed._all_gather_base` is a private function and will be deprecated. Please use `torch.distributed.all_gather_into_tensor` instead. - return func(*args, **kwargs) -/skishore/github/pytorch/torch/distributed/c10d_logger.py:81: FutureWarning: `torch.distributed._all_gather_base` is a private function and will be deprecated. Please use `torch.distributed.all_gather_into_tensor` instead. - return func(*args, **kwargs) -[dist init] rank = 1, world_size = 4 -[dist init] rank = 0, world_size = 4 -[dist init] rank = 3, world_size = 4 -[dist init] rank = 2, world_size = 4 -ok -test_column_parallel_linear (test_layers.NcclTensorParallelLayerTest) ... [rank0]:[W603 16:00:01.921646084 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. 
-[rank2]:[W603 16:00:01.978806709 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank1]:[W603 16:00:01.015221152 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank3]:[W603 16:00:02.338829601 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[dist init] rank = 0, world_size = 4 -[dist init] rank = 3, world_size = 4 -[dist init] rank = 1, world_size = 4 -[dist init] rank = 2, world_size = 4 -ok -test_column_parallel_linear_async (test_layers.NcclTensorParallelLayerTest) ... [rank2]:[W603 16:00:28.003290927 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank1]:[W603 16:00:28.006099365 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank3]:[W603 16:00:29.187504064 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank0]:[W603 16:00:29.407647177 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[dist init] rank = 0, world_size = 4 -[dist init] rank = 3, world_size = 4 -[dist init] rank = 1, world_size = 4 -[dist init] rank = 2, world_size = 4 -ok -test_column_parallel_linear_exception (test_layers.NcclTensorParallelLayerTest) ... [rank2]:[W603 16:00:55.497104619 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank0]:[W603 16:00:55.499103466 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank3]:[W603 16:00:55.499710677 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. 
You can specify device_id in init_process_group() to force use of a particular device. -[rank1]:[W603 16:00:55.505389085 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[dist init] rank = 0, world_size = 4 -[dist init] rank = 3, world_size = 4 -[dist init] rank = 1, world_size = 4 -[dist init] rank = 2, world_size = 4 -ok -test_column_parallel_linear_gradient_accumulation_fusion (test_layers.NcclTensorParallelLayerTest) ... [rank0]:[W603 16:01:08.438194094 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank3]:[W603 16:01:08.541519924 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank1]:[W603 16:01:08.588451817 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank2]:[W603 16:01:08.668895046 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[dist init] rank = 0, world_size = 4 -[dist init] rank = 1, world_size = 4 -[dist init] rank = 3, world_size = 4 -[dist init] rank = 2, world_size = 4 -ok -test_column_parallel_linear_gradient_accumulation_fusion_in_fp16 (test_layers.NcclTensorParallelLayerTest) ... [rank2]:[W603 16:01:36.871666800 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank1]:[W603 16:01:36.875749479 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank0]:[W603 16:01:36.881789253 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank3]:[W603 16:01:37.327369926 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. 
-[dist init] rank = 0, world_size = 4 -[dist init] rank = 2, world_size = 4 -[dist init] rank = 1, world_size = 4 -[dist init] rank = 3, world_size = 4 -ok -test_column_parallel_linear_sequence_parallel (test_layers.NcclTensorParallelLayerTest) ... [rank1]:[W603 16:02:03.137927117 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank2]:[W603 16:02:03.145352918 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank0]:[W603 16:02:03.157190847 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank3]:[W603 16:02:03.507710155 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/layers.py:547: UserWarning: `sequence_parallel_enabled` is set to `True`, but got world_size of 1 - warnings.warn( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/layers.py:547: UserWarning: `sequence_parallel_enabled` is set to `True`, but got world_size of 1 - warnings.warn( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/layers.py:547: UserWarning: `sequence_parallel_enabled` is set to `True`, but got world_size of 1 - warnings.warn( -/skishore/github/pytorch/torch/distributed/c10d_logger.py:81: FutureWarning: `torch.distributed._all_gather_base` is a private function and will be deprecated. Please use `torch.distributed.all_gather_into_tensor` instead. - return func(*args, **kwargs) -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/layers.py:547: UserWarning: `sequence_parallel_enabled` is set to `True`, but got world_size of 1 - warnings.warn( -/skishore/github/pytorch/torch/distributed/c10d_logger.py:81: FutureWarning: `torch.distributed._all_gather_base` is a private function and will be deprecated. Please use `torch.distributed.all_gather_into_tensor` instead. - return func(*args, **kwargs) -/skishore/github/pytorch/torch/distributed/c10d_logger.py:81: FutureWarning: `torch.distributed._all_gather_base` is a private function and will be deprecated. Please use `torch.distributed.all_gather_into_tensor` instead. - return func(*args, **kwargs) -/skishore/github/pytorch/torch/distributed/c10d_logger.py:81: FutureWarning: `torch.distributed._all_gather_base` is a private function and will be deprecated. Please use `torch.distributed.all_gather_into_tensor` instead. - return func(*args, **kwargs) -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/layers.py:358: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. 
Please use `torch.distributed.reduce_scatter_tensor` instead. - handle = torch.distributed._reduce_scatter_base( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/layers.py:358: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead. - handle = torch.distributed._reduce_scatter_base( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/layers.py:358: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead. - handle = torch.distributed._reduce_scatter_base( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/layers.py:358: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead. - handle = torch.distributed._reduce_scatter_base( -[dist init] rank = 3, world_size = 4 -[dist init] rank = 1, world_size = 4 -[dist init] rank = 2, world_size = 4 -[dist init] rank = 0, world_size = 4 -ok -test_parallel_embedding (test_layers.NcclTensorParallelLayerTest) ... [rank2]:[W603 16:02:32.355274319 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank0]:[W603 16:02:32.358432266 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank1]:[W603 16:02:32.358927070 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank3]:[W603 16:02:32.815480058 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[dist init] rank = 3, world_size = 4 -[dist init] rank = 0, world_size = 4 -[dist init] rank = 1, world_size = 4 -[dist init] rank = 2, world_size = 4 -ok -test_reduce_scatter_parity (test_layers.NcclTensorParallelLayerTest) ... [rank3]:[W603 16:02:51.824959503 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank2]:[W603 16:02:51.828917257 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. 
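The FutureWarnings above come from the private torch.distributed._all_gather_base and _reduce_scatter_base helpers used by apex.transformer and by the tests. A small sketch of the public replacements the warnings name; the tensor shapes and default process group here are illustrative assumptions, not values taken from the tests:

# Hedged sketch of the public collectives the deprecation warnings point to.
import torch
import torch.distributed as dist

def gather_then_scatter(local: torch.Tensor) -> torch.Tensor:
    world_size = dist.get_world_size()

    # replacement for torch.distributed._all_gather_base(output, input)
    gathered = torch.empty((world_size * local.shape[0], *local.shape[1:]),
                           dtype=local.dtype, device=local.device)
    dist.all_gather_into_tensor(gathered, local)

    # replacement for torch.distributed._reduce_scatter_base(output, input)
    scattered = torch.empty_like(local)
    dist.reduce_scatter_tensor(scattered, gathered)
    return scattered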
-[rank0]:[W603 16:02:51.871644950 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank1]:[W603 16:02:51.975317575 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -/skishore/github/apex/tests/L0/run_transformer/test_layers.py:127: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead. - _reduce_scatter_base( -/skishore/github/apex/tests/L0/run_transformer/test_layers.py:127: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead. - _reduce_scatter_base( -/skishore/github/apex/tests/L0/run_transformer/test_layers.py:127: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead. - _reduce_scatter_base( -/skishore/github/apex/tests/L0/run_transformer/test_layers.py:127: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead. - _reduce_scatter_base( -[dist init] rank = 0, world_size = 4 -[dist init] rank = 3, world_size = 4 -[dist init] rank = 2, world_size = 4 -[dist init] rank = 1, world_size = 4 -ok -test_row_parallel_linear (test_layers.NcclTensorParallelLayerTest) ... [rank1]:[W603 16:03:15.587822867 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank2]:[W603 16:03:15.593311653 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank0]:[W603 16:03:15.595137332 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank3]:[W603 16:03:15.597665144 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[dist init] rank = 0, world_size = 4 -[dist init] rank = 3, world_size = 4 -[dist init] rank = 2, world_size = 4 -[dist init] rank = 1, world_size = 4 -ok -test_row_parallel_linear_gradient_accumulation_fusion (test_layers.NcclTensorParallelLayerTest) ... 
[rank0]:[W603 16:03:46.418085342 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank2]:[W603 16:03:46.426745535 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank1]:[W603 16:03:46.428857389 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank3]:[W603 16:03:46.679548820 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[dist init] rank = 0, world_size = 4 -[dist init] rank = 1, world_size = 4 -[dist init] rank = 2, world_size = 4 -[dist init] rank = 3, world_size = 4 -ok -test_row_parallel_linear_gradient_accumulation_fusion_in_fp16 (test_layers.NcclTensorParallelLayerTest) ... [rank1]:[W603 16:04:17.567956750 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank3]:[W603 16:04:17.569834858 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank2]:[W603 16:04:17.571835267 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank0]:[W603 16:04:17.572720190 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[dist init] rank = 0, world_size = 4 -[dist init] rank = 3, world_size = 4 -[dist init] rank = 1, world_size = 4 -[dist init] rank = 2, world_size = 4 -ok -test_row_parallel_linear_sequence_parallel (test_layers.NcclTensorParallelLayerTest) ... [rank1]:[W603 16:04:43.923494574 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank3]:[W603 16:04:43.927424246 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. 
This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank2]:[W603 16:04:43.934572735 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank0]:[W603 16:04:43.936981620 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/mappings.py:125: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead. - torch.distributed._reduce_scatter_base( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/mappings.py:125: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead. - torch.distributed._reduce_scatter_base( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/mappings.py:125: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead. - torch.distributed._reduce_scatter_base( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/mappings.py:125: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead. - torch.distributed._reduce_scatter_base( -/skishore/github/pytorch/torch/distributed/c10d_logger.py:81: FutureWarning: `torch.distributed._all_gather_base` is a private function and will be deprecated. Please use `torch.distributed.all_gather_into_tensor` instead. - return func(*args, **kwargs) -/skishore/github/pytorch/torch/distributed/c10d_logger.py:81: FutureWarning: `torch.distributed._all_gather_base` is a private function and will be deprecated. Please use `torch.distributed.all_gather_into_tensor` instead. - return func(*args, **kwargs) -/skishore/github/pytorch/torch/distributed/c10d_logger.py:81: FutureWarning: `torch.distributed._all_gather_base` is a private function and will be deprecated. Please use `torch.distributed.all_gather_into_tensor` instead. - return func(*args, **kwargs) -/skishore/github/pytorch/torch/distributed/c10d_logger.py:81: FutureWarning: `torch.distributed._all_gather_base` is a private function and will be deprecated. Please use `torch.distributed.all_gather_into_tensor` instead. - return func(*args, **kwargs) -[dist init] rank = 3, world_size = 4 -[dist init] rank = 2, world_size = 4 -[dist init] rank = 0, world_size = 4 -[dist init] rank = 1, world_size = 4 -ok -test_affine_weight_init_column_parallel_cpu (test_layers.UccTensorParallelLayerTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' -test_affine_weight_init_column_parallel_gpu (test_layers.UccTensorParallelLayerTest) ... 
skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' -test_affine_weight_init_row_parallel_cpu (test_layers.UccTensorParallelLayerTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' -test_affine_weight_init_row_parallel_gpu (test_layers.UccTensorParallelLayerTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' -test_all_gather_parity (test_layers.UccTensorParallelLayerTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' -test_column_parallel_linear (test_layers.UccTensorParallelLayerTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' -test_column_parallel_linear_async (test_layers.UccTensorParallelLayerTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' -test_column_parallel_linear_exception (test_layers.UccTensorParallelLayerTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' -test_column_parallel_linear_gradient_accumulation_fusion (test_layers.UccTensorParallelLayerTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' -test_column_parallel_linear_gradient_accumulation_fusion_in_fp16 (test_layers.UccTensorParallelLayerTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' -test_column_parallel_linear_sequence_parallel (test_layers.UccTensorParallelLayerTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' -test_parallel_embedding (test_layers.UccTensorParallelLayerTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' -test_reduce_scatter_parity (test_layers.UccTensorParallelLayerTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' -test_row_parallel_linear (test_layers.UccTensorParallelLayerTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' -test_row_parallel_linear_gradient_accumulation_fusion (test_layers.UccTensorParallelLayerTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' -test_row_parallel_linear_gradient_accumulation_fusion_in_fp16 (test_layers.UccTensorParallelLayerTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' -test_row_parallel_linear_sequence_parallel (test_layers.UccTensorParallelLayerTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' -test_gather (test_mapping.NcclMappingTest) ... [rank0]:[W603 16:05:13.944825889 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank2]:[W603 16:05:13.947762668 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank1]:[W603 16:05:13.958828241 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. 
You can specify device_id in init_process_group() to force use of a particular device. -[rank3]:[W603 16:05:14.500687005 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[dist init] rank = 0, world_size = 4 -[dist init] rank = 3, world_size = 4 -[dist init] rank = 1, world_size = 4 -[dist init] rank = 2, world_size = 4 -ok -test_reduce (test_mapping.NcclMappingTest) ... [rank1]:[W603 16:05:35.365038307 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank2]:[W603 16:05:35.370537609 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank0]:[W603 16:05:35.371594948 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank3]:[W603 16:05:35.697203236 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[dist init] rank = 0, world_size = 4 -[dist init] rank = 3, world_size = 4 -[dist init] rank = 1, world_size = 4 -[dist init] rank = 2, world_size = 4 -ok -test_split (test_mapping.NcclMappingTest) ... [rank1]:[W603 16:05:53.691328007 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank0]:[W603 16:05:53.696602195 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank3]:[W603 16:05:54.608251510 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank2]:[W603 16:05:54.632199522 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[dist init] rank = 0, world_size = 4 -[dist init] rank = 3, world_size = 4 -[dist init] rank = 1, world_size = 4 -[dist init] rank = 2, world_size = 4 -ok -test_gather (test_mapping.UccMappingTest) ... 
skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' -test_reduce (test_mapping.UccMappingTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' -test_split (test_mapping.UccMappingTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' -test_constant_microbatch_calculator (test_microbatches.NcclMicrobatchCalculatorTest) ... [rank2]:[W603 16:06:08.861642985 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank0]:[W603 16:06:08.862476903 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank1]:[W603 16:06:08.875005176 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank3]:[W603 16:06:08.876428989 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/utils.py:81: UserWarning: This function is only for unittest - warnings.warn("This function is only for unittest") -[dist init] rank = 0, world_size = 4 -[dist init] rank = 3, world_size = 4 -[dist init] rank = 2, world_size = 4 -[dist init] rank = 1, world_size = 4 -ok -test_dynamic_microbatch_calculator (test_microbatches.NcclMicrobatchCalculatorTest) ... [rank2]:[W603 16:06:22.945233453 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank0]:[W603 16:06:22.947858800 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank3]:[W603 16:06:22.949114774 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank1]:[W603 16:06:23.084504080 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. 
-/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/utils.py:81: UserWarning: This function is only for unittest - warnings.warn("This function is only for unittest") -[dist init] rank = 3, world_size = 4 -[dist init] rank = 0, world_size = 4 -[dist init] rank = 2, world_size = 4 -[dist init] rank = 1, world_size = 4 -ok -test_constant_microbatch_calculator (test_microbatches.UccMicrobatchCalculatorTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' -test_dynamic_microbatch_calculator (test_microbatches.UccMicrobatchCalculatorTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' -test_no_interleaving_warmup (test_p2p_comm.UccP2PCommTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' -test_send_backward_recv_backward (test_p2p_comm.UccP2PCommTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' -test_send_forward_recv_forward (test_p2p_comm.UccP2PCommTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' -test_initialize_model_parallel (test_parallel_state.NcclParallelStateTest) ... [rank1]:[W603 16:06:37.933031514 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank0]:[W603 16:06:37.937152991 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank3]:[W603 16:06:37.938813605 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank2]:[W603 16:06:38.597196668 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[dist init] rank = 3, world_size = 4 -[dist init] rank = 0, world_size = 4 -[dist init] rank = 1, world_size = 4 -[dist init] rank = 2, world_size = 4 -ok -test_initialize_model_parallel_decoder_only (test_parallel_state.NcclParallelStateTest) -Initialize model parallelism for decoder-only Transformers like GPT-3 ... [rank0]:[W603 16:06:52.865035520 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank3]:[W603 16:06:52.884775276 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. 
-[rank1]:[W603 16:06:52.886067374 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank2]:[W603 16:06:53.276617324 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[dist init] rank = 0, world_size = 4 -[dist init] rank = 3, world_size = 4 -[dist init] rank = 1, world_size = 4 -[dist init] rank = 2, world_size = 4 -ok -test_initialize_model_parallel_with_virtual_and_split (test_parallel_state.NcclParallelStateTest) ... [rank2]:[W603 16:07:06.873261341 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank0]:[W603 16:07:06.883642889 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank1]:[W603 16:07:06.885682606 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank3]:[W603 16:07:07.471477788 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[dist init] rank = 0, world_size = 4 -[dist init] rank = 3, world_size = 4 -[dist init] rank = 2, world_size = 4 -[dist init] rank = 1, world_size = 4 -ok -test_initialize_model_parallel (test_parallel_state.UccParallelStateTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' -test_initialize_model_parallel_decoder_only (test_parallel_state.UccParallelStateTest) -Initialize model parallelism for decoder-only Transformers like GPT-3 ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' -test_initialize_model_parallel_with_virtual_and_split (test_parallel_state.UccParallelStateTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)' -test_inference_async_pipelining_with_interleaving (test_pipeline_parallel_fwd_bwd.NcclPipelineParallelForwardBackwardTest) ... [rank5]:[W603 16:07:22.584738687 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 5] using GPU 5 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank1]:[W603 16:07:22.591937221 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. 
This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank3]:[W603 16:07:22.593373272 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank2]:[W603 16:07:22.027355011 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank0]:[W603 16:07:23.106428187 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank7]:[W603 16:07:23.145665811 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 7] using GPU 7 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank4]:[W603 16:07:23.146369115 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 4] using GPU 4 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank6]:[W603 16:07:23.153563473 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 6] using GPU 6 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[dist init] rank = 4, world_size = 8 - > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 2112 -[dist init] rank = 2, world_size = 8 - > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 2112 -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/utils.py:81: UserWarning: This function is only for unittest - warnings.warn("This function is only for unittest") -[dist init] rank = 6, world_size = 8 - > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 2112 -[dist init] rank = 0, world_size = 8 - > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 2112 -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. 
- with torch.cuda.amp.autocast( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( - > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 2112 - > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 2112 - > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 2112 - > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 2112 - > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 2112 - > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 2112 - > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 2112 - > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 2112 -/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:208: UserWarning: Converting a tensor with requires_grad=True to a scalar may lead to unexpected behavior. -Consider using tensor.detach() first. (Triggered internally at /skishore/github/pytorch/aten/src/ATen/native/Scalar.cpp:22.) - self.assertEqual(x.item() / microbatch_size, target_loss.item()) - > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 2112 - > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 2112 - > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 2112 -/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:208: UserWarning: Converting a tensor with requires_grad=True to a scalar may lead to unexpected behavior. -Consider using tensor.detach() first. (Triggered internally at /skishore/github/pytorch/aten/src/ATen/native/Scalar.cpp:22.) - self.assertEqual(x.item() / microbatch_size, target_loss.item()) - > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 2112 -/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. 
- torch.cuda.amp.GradScaler(init_scale=4.0) - > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 2112 -/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. - torch.cuda.amp.GradScaler(init_scale=4.0) -/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. - torch.cuda.amp.GradScaler(init_scale=4.0) - > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 2112 -/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. - torch.cuda.amp.GradScaler(init_scale=4.0) - > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 2112 -/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. - torch.cuda.amp.GradScaler(init_scale=4.0) -/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. - torch.cuda.amp.GradScaler(init_scale=4.0) -/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. - torch.cuda.amp.GradScaler(init_scale=4.0) -/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. 
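The torch.cuda.amp deprecations and the requires_grad-to-scalar warning repeated above each have a direct replacement in current PyTorch. A minimal hedged sketch; model, inputs and targets are placeholders, not objects from test_pipeline_parallel_fwd_bwd.py:

# Hedged sketch of the replacements the FutureWarnings/UserWarning ask for.
import torch

scaler = torch.amp.GradScaler("cuda", init_scale=4.0)  # was torch.cuda.amp.GradScaler(init_scale=4.0)

def training_step(model, inputs, targets):
    with torch.amp.autocast("cuda", dtype=torch.float16):  # was torch.cuda.amp.autocast(...)
        loss = torch.nn.functional.mse_loss(model(inputs), targets)
    scaler.scale(loss).backward()
    # .detach() before .item() avoids "Converting a tensor with requires_grad=True to a scalar"
    return loss.detach().item()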
- torch.cuda.amp.GradScaler(init_scale=4.0) - > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 2112 - > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 2112 - > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 2112 - > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 2112 - > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 2112 - > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 2112 - > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 2112 - > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 2112 - > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 2112 - > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 2112 - > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 2112 - > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 2112 - > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 2112 -[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Caught exception: -[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Traceback (most recent call last): -[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test -[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] getattr(self, test_name)() -[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper -[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] fn() -[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper -[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] method(*args, **kwargs) -[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 278, in test_inference_async_pipelining_with_interleaving -[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] self._forward_backward_test_impl( -[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl -[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] loss = fwd_bwd_func( -[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_with_interleaving.py", line 219, in _forward_backward_pipelining_with_interleaving -[rank6]:E0603 16:08:24.922000 42075 
/skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_communication.recv_forward( -[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 342, in recv_forward -[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor, _ = _communicate( -[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate -[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm) -[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops -[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] reqs = torch.distributed.batch_isend_irecv(ops) -[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv -[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_op.op( -[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2411, in irecv -[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] return group.recv([tensor], group_src, tag) -[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1 -[rank6]:E0603 16:08:24.922000 42075 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error. 
-[rank6] Last error:
-[rank6] Error while creating shared memory segment /dev/shm/nccl-qmtFLO (size 8257920), error: No space left on device (28)
-[rank6] To execute this test, run the following from the base repo dir:
-[rank6] PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_inference_async_pipelining_with_interleaving
-[rank6] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0
-[rank6] exiting process 6 with exit code: 10
-[rank0] Caught exception:
-[rank0] Traceback (most recent call last):
-[rank0]   File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test
-[rank0]     getattr(self, test_name)()
-[rank0]   File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper
-[rank0]     fn()
-[rank0]   File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper
-[rank0]     method(*args, **kwargs)
-[rank0]   File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 278, in test_inference_async_pipelining_with_interleaving
-[rank0]     self._forward_backward_test_impl(
-[rank0]   File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl
-[rank0]     loss = fwd_bwd_func(
-[rank0]   File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_with_interleaving.py", line 276, in _forward_backward_pipelining_with_interleaving
-[rank0]     input_tensor = p2p_communication.send_forward_recv_forward(
-[rank0]   File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 515, in send_forward_recv_forward
-[rank0]     input_tensor, _ = _communicate(
-[rank0]   File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate
-[rank0]     tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm)
-[rank0]   File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops
-[rank0]     reqs = torch.distributed.batch_isend_irecv(ops)
-[rank0]   File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv
-[rank0]     p2p_op.op(
-[rank0]   File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2366, in isend
-[rank0]     return group.send([tensor], group_dst, tag)
-[rank0] torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1
-[rank0] ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error.
-[rank0] Last error:
-[rank0] Error while creating shared memory segment /dev/shm/nccl-CTs4dF (size 8257920), error: No space left on device (28)
-[rank0] exiting process 0 with exit code: 10
-[rank4], [rank2], [rank3], [rank5], [rank7] Caught exception: the same traceback, entering the schedule at fwd_bwd_pipelining_with_interleaving.py line 219 (p2p_communication.recv_forward) and failing in distributed_c10d.py irecv (return group.recv([tensor], group_src, tag)) with the same DistBackendError / ncclSystemError; each rank reports its own /dev/shm/nccl-* segment (size 8257920) with "No space left on device (28)" and exits processes 4, 2, 3, 5 and 7 with exit code: 10
-[rank1] Caught exception: the same traceback as rank 0 (send_forward_recv_forward -> isend), same shared memory error, exiting process 1 with exit code: 10
-[dist init] rank = 3, world_size = 8
-[dist init] rank = 5, world_size = 8
-[dist init] rank = 1, world_size = 8
-[dist init] rank = 7, world_size = 8
-[rank0..7]:[W603 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())  (one such warning per rank)
-ERROR
-test_inference_async_pipelining_without_interleaving (test_pipeline_parallel_fwd_bwd.NcclPipelineParallelForwardBackwardTest) ... Process 3 terminated with exit code 10, terminating remaining processes.
-[rank0..7]:[W603 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank N] using GPU N as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device.  (one such warning per rank, N = 0..7)
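All of the failures above share one root cause: NCCL cannot create its ~8 MB shared-memory segment because /dev/shm on the test machine is full (errno 28), so the batched p2p send/recv ops never start. The sketch below is not part of this patch series; it is only a hedged illustration of a pre-flight check one could run before launching these 8-rank tests. The helper name check_dev_shm and the 512 MiB threshold are made up for illustration.

    # Hypothetical pre-flight check (illustrative only, not part of this patch):
    # verify /dev/shm has headroom before starting the 8-process NCCL tests, since
    # a full /dev/shm produces the "No space left on device (28)" errors logged above.
    import shutil

    def check_dev_shm(min_free_bytes: int = 512 * 1024 * 1024) -> None:
        # shutil.disk_usage returns (total, used, free) in bytes for the given path.
        total, used, free = shutil.disk_usage("/dev/shm")
        print(f"/dev/shm: total={total >> 20} MiB used={used >> 20} MiB free={free >> 20} MiB")
        if free < min_free_bytes:
            # The threshold is arbitrary; each NCCL segment here is ~8 MB per rank pair.
            raise RuntimeError(
                "/dev/shm is nearly full; NCCL will likely fail with "
                "'No space left on device (28)'."
            )

    if __name__ == "__main__":
        check_dev_shm()

If the check fails, enlarging the tmpfs (for example Docker's --shm-size option when the tests run in a container) or clearing stale /dev/shm/nccl-* files left by crashed runs are the usual remedies; these are suggestions, not steps taken in this patch series.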
-[dist init] rank = 6, world_size = 8
- > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 1056
-[dist init] rank = 4, world_size = 8
- > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 1056
-[dist init] rank = 2, world_size = 8
- > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 1056
-/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/utils.py:81: UserWarning: This function is only for unittest
-  warnings.warn("This function is only for unittest")
-[dist init] rank = 0, world_size = 8
- > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 1056  (the same count of 1056 is reported again for pipeline ranks 0..3 in each configuration the test sets up)
-/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.  (emitted once per rank)
-  with torch.cuda.amp.autocast(
-/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:208: UserWarning: Converting a tensor with requires_grad=True to a scalar may lead to unexpected behavior. Consider using tensor.detach() first. (Triggered internally at /skishore/github/pytorch/aten/src/ATen/native/Scalar.cpp:22.)
-  self.assertEqual(x.item() / microbatch_size, target_loss.item())
-/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead.  (emitted once per rank)
-  torch.cuda.amp.GradScaler(init_scale=4.0)
-[rank2] Caught exception:
-[rank2] Traceback (most recent call last):
-[rank2]   File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test
-[rank2]     getattr(self, test_name)()
-[rank2]   File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper
-[rank2]     fn()
-[rank2]   File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper
-[rank2]     method(*args, **kwargs)
-[rank2]   File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 254, in test_inference_async_pipelining_without_interleaving
-[rank2]     self._forward_backward_test_impl(
-[rank2]   File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl
-[rank2]     loss = fwd_bwd_func(
-[rank2]   File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 332, in forward_backward_pipelining_without_interleaving
-[rank2]     input_tensor = recv_forward(
-[rank2]   File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 99, in recv_forward
-[rank2]     p2p_communication.recv_forward(
-[rank2]   File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 342, in recv_forward
-[rank2]     input_tensor, _ = _communicate(
-[rank2]   File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate
-[rank2]     tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm)
-[rank2]   File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops
-[rank2]     reqs = torch.distributed.batch_isend_irecv(ops)
-[rank2]   File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv
-[rank2]     p2p_op.op(
-[rank2]   File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2411, in irecv
-[rank2]     return group.recv([tensor], group_src, tag)
-[rank2] torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1
-[rank2] ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error.
-[rank2] Last error:
-[rank2] Error while creating shared memory segment /dev/shm/nccl-fcsADG (size 8257920), error: No space left on device (28)
-[rank2] To execute this test, run the following from the base repo dir:
-[rank2] PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_inference_async_pipelining_without_interleaving
-[rank2] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0
-[rank2] exiting process 2 with exit code: 10
-[rank0] Caught exception: same traceback as rank 2 down to _forward_backward_test_impl, then:
-[rank0]   File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 349, in forward_backward_pipelining_without_interleaving
-[rank0]     send_forward(
-[rank0]   File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 145, in send_forward
-[rank0]     p2p_communication.send_forward(
-[rank0]   File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 401, in send_forward
-[rank0]     _communicate(
-[rank0]   File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate
-[rank0]     tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm)
-[rank0]   File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops
-[rank0]     reqs = torch.distributed.batch_isend_irecv(ops)
-[rank0]   File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv
-[rank0]     p2p_op.op(
-[rank0]   File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2366, in isend
-[rank0]     return group.send([tensor], group_dst, tag)
-[rank0] torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1
-[rank0] ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error.
-[rank0]:E0603 16:09:36.536000 46102 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Last error: -[rank0]:E0603 16:09:36.536000 46102 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Error while creating shared memory segment /dev/shm/nccl-JQj2bD (size 8257920), error: No space left on device (28) -[rank0]:E0603 16:09:36.536000 46102 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] -[rank0]:E0603 16:09:36.536000 46102 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] To execute this test, run the following from the base repo dir: -[rank0]:E0603 16:09:36.536000 46102 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_inference_async_pipelining_without_interleaving -[rank0]:E0603 16:09:36.536000 46102 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] -[rank0]:E0603 16:09:36.536000 46102 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 -[rank0]:E0603 16:09:36.536000 46102 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 0 with exit code: 10 -[rank6]:E0603 16:09:36.536000 46108 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Caught exception: -[rank6]:E0603 16:09:36.536000 46108 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Traceback (most recent call last): -[rank6]:E0603 16:09:36.536000 46108 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test -[rank6]:E0603 16:09:36.536000 46108 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] getattr(self, test_name)() -[rank6]:E0603 16:09:36.536000 46108 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper -[rank6]:E0603 16:09:36.536000 46108 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] fn() -[rank6]:E0603 16:09:36.536000 46108 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper -[rank6]:E0603 16:09:36.536000 46108 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] method(*args, **kwargs) -[rank6]:E0603 16:09:36.536000 46108 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 254, in test_inference_async_pipelining_without_interleaving -[rank6]:E0603 16:09:36.536000 46108 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] self._forward_backward_test_impl( -[rank6]:E0603 16:09:36.536000 46108 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl -[rank6]:E0603 16:09:36.536000 46108 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] loss = fwd_bwd_func( -[rank6]:E0603 16:09:36.536000 46108 
/skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 367, in forward_backward_pipelining_without_interleaving -[rank6]:E0603 16:09:36.536000 46108 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor: List[Union[None, torch.Tensor, FutureTensor]] = recv_forward(tensor_shapes=recv_tensor_shapes, dtype=dtype, async_comm=async_comm) -[rank6]:E0603 16:09:36.536000 46108 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 99, in recv_forward -[rank6]:E0603 16:09:36.536000 46108 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_communication.recv_forward( -[rank6]:E0603 16:09:36.536000 46108 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 342, in recv_forward -[rank6]:E0603 16:09:36.536000 46108 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor, _ = _communicate( -[rank6]:E0603 16:09:36.536000 46108 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate -[rank6]:E0603 16:09:36.536000 46108 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm) -[rank6]:E0603 16:09:36.536000 46108 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops -[rank6]:E0603 16:09:36.536000 46108 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] reqs = torch.distributed.batch_isend_irecv(ops) -[rank6]:E0603 16:09:36.536000 46108 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv -[rank6]:E0603 16:09:36.536000 46108 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_op.op( -[rank6]:E0603 16:09:36.536000 46108 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2411, in irecv -[rank6]:E0603 16:09:36.536000 46108 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] return group.recv([tensor], group_src, tag) -[rank6]:E0603 16:09:36.536000 46108 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1 -[rank6]:E0603 16:09:36.536000 46108 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] ncclSystemError: System call (e.g. 
socket, malloc) or external library call failed or device error. -[rank6]:E0603 16:09:36.536000 46108 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Last error: -[rank6]:E0603 16:09:36.536000 46108 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Error while creating shared memory segment /dev/shm/nccl-jXAXk8 (size 8257920), error: No space left on device (28) -[rank6]:E0603 16:09:36.536000 46108 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] -[rank6]:E0603 16:09:36.536000 46108 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] To execute this test, run the following from the base repo dir: -[rank6]:E0603 16:09:36.536000 46108 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_inference_async_pipelining_without_interleaving -[rank6]:E0603 16:09:36.536000 46108 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] -[rank6]:E0603 16:09:36.536000 46108 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 -[rank6]:E0603 16:09:36.536000 46108 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 6 with exit code: 10 -[rank4]:E0603 16:09:36.536000 46106 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Caught exception: -[rank4]:E0603 16:09:36.536000 46106 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Traceback (most recent call last): -[rank4]:E0603 16:09:36.536000 46106 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test -[rank4]:E0603 16:09:36.536000 46106 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] getattr(self, test_name)() -[rank4]:E0603 16:09:36.536000 46106 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper -[rank4]:E0603 16:09:36.536000 46106 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] fn() -[rank4]:E0603 16:09:36.536000 46106 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper -[rank4]:E0603 16:09:36.536000 46106 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] method(*args, **kwargs) -[rank4]:E0603 16:09:36.536000 46106 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 254, in test_inference_async_pipelining_without_interleaving -[rank4]:E0603 16:09:36.536000 46106 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] self._forward_backward_test_impl( -[rank4]:E0603 16:09:36.536000 46106 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl -[rank4]:E0603 16:09:36.536000 46106 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] loss = fwd_bwd_func( -[rank4]:E0603 
16:09:36.536000 46106 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 332, in forward_backward_pipelining_without_interleaving -[rank4]:E0603 16:09:36.536000 46106 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor = recv_forward( -[rank4]:E0603 16:09:36.536000 46106 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 99, in recv_forward -[rank4]:E0603 16:09:36.536000 46106 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_communication.recv_forward( -[rank4]:E0603 16:09:36.536000 46106 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 342, in recv_forward -[rank4]:E0603 16:09:36.536000 46106 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor, _ = _communicate( -[rank4]:E0603 16:09:36.536000 46106 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate -[rank4]:E0603 16:09:36.536000 46106 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm) -[rank4]:E0603 16:09:36.536000 46106 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops -[rank4]:E0603 16:09:36.536000 46106 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] reqs = torch.distributed.batch_isend_irecv(ops) -[rank4]:E0603 16:09:36.536000 46106 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv -[rank4]:E0603 16:09:36.536000 46106 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_op.op( -[rank4]:E0603 16:09:36.536000 46106 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2411, in irecv -[rank4]:E0603 16:09:36.536000 46106 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] return group.recv([tensor], group_src, tag) -[rank4]:E0603 16:09:36.536000 46106 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1 -[rank4]:E0603 16:09:36.536000 46106 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error. 
-[rank4]:E0603 16:09:36.536000 46106 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Last error: -[rank4]:E0603 16:09:36.536000 46106 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Error while creating shared memory segment /dev/shm/nccl-py8DkW (size 8257920), error: No space left on device (28) -[rank4]:E0603 16:09:36.536000 46106 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] -[rank4]:E0603 16:09:36.536000 46106 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] To execute this test, run the following from the base repo dir: -[rank4]:E0603 16:09:36.536000 46106 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_inference_async_pipelining_without_interleaving -[rank4]:E0603 16:09:36.536000 46106 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] -[rank4]:E0603 16:09:36.536000 46106 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 -[rank4]:E0603 16:09:36.536000 46106 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 4 with exit code: 10 -[rank5]:E0603 16:09:36.557000 46107 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Caught exception: -[rank5]:E0603 16:09:36.557000 46107 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Traceback (most recent call last): -[rank5]:E0603 16:09:36.557000 46107 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test -[rank5]:E0603 16:09:36.557000 46107 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] getattr(self, test_name)() -[rank5]:E0603 16:09:36.557000 46107 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper -[rank5]:E0603 16:09:36.557000 46107 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] fn() -[rank5]:E0603 16:09:36.557000 46107 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper -[rank5]:E0603 16:09:36.557000 46107 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] method(*args, **kwargs) -[rank5]:E0603 16:09:36.557000 46107 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 254, in test_inference_async_pipelining_without_interleaving -[rank5]:E0603 16:09:36.557000 46107 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] self._forward_backward_test_impl( -[rank5]:E0603 16:09:36.557000 46107 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl -[rank5]:E0603 16:09:36.557000 46107 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] loss = fwd_bwd_func( -[rank5]:E0603 16:09:36.557000 46107 
/skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 332, in forward_backward_pipelining_without_interleaving -[rank5]:E0603 16:09:36.557000 46107 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor = recv_forward( -[rank5]:E0603 16:09:36.557000 46107 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 99, in recv_forward -[rank5]:E0603 16:09:36.557000 46107 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_communication.recv_forward( -[rank5]:E0603 16:09:36.557000 46107 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 342, in recv_forward -[rank5]:E0603 16:09:36.557000 46107 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor, _ = _communicate( -[rank5]:E0603 16:09:36.557000 46107 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate -[rank5]:E0603 16:09:36.557000 46107 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm) -[rank5]:E0603 16:09:36.557000 46107 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops -[rank5]:E0603 16:09:36.557000 46107 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] reqs = torch.distributed.batch_isend_irecv(ops) -[rank5]:E0603 16:09:36.557000 46107 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv -[rank5]:E0603 16:09:36.557000 46107 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_op.op( -[rank5]:E0603 16:09:36.557000 46107 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2411, in irecv -[rank5]:E0603 16:09:36.557000 46107 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] return group.recv([tensor], group_src, tag) -[rank5]:E0603 16:09:36.557000 46107 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1 -[rank5]:E0603 16:09:36.557000 46107 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error. 
-[rank5]:E0603 16:09:36.557000 46107 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Last error: -[rank5]:E0603 16:09:36.557000 46107 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Error while creating shared memory segment /dev/shm/nccl-MOrMPr (size 8257920), error: No space left on device (28) -[rank5]:E0603 16:09:36.557000 46107 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] -[rank5]:E0603 16:09:36.557000 46107 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] To execute this test, run the following from the base repo dir: -[rank5]:E0603 16:09:36.557000 46107 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_inference_async_pipelining_without_interleaving -[rank5]:E0603 16:09:36.557000 46107 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] -[rank5]:E0603 16:09:36.557000 46107 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 -[rank5]:E0603 16:09:36.557000 46107 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 5 with exit code: 10 -[rank3]:E0603 16:09:36.557000 46105 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Caught exception: -[rank3]:E0603 16:09:36.557000 46105 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Traceback (most recent call last): -[rank3]:E0603 16:09:36.557000 46105 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test -[rank3]:E0603 16:09:36.557000 46105 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] getattr(self, test_name)() -[rank3]:E0603 16:09:36.557000 46105 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper -[rank3]:E0603 16:09:36.557000 46105 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] fn() -[rank3]:E0603 16:09:36.557000 46105 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper -[rank3]:E0603 16:09:36.557000 46105 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] method(*args, **kwargs) -[rank3]:E0603 16:09:36.557000 46105 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 254, in test_inference_async_pipelining_without_interleaving -[rank3]:E0603 16:09:36.557000 46105 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] self._forward_backward_test_impl( -[rank3]:E0603 16:09:36.557000 46105 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl -[rank3]:E0603 16:09:36.557000 46105 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] loss = fwd_bwd_func( -[rank3]:E0603 16:09:36.557000 46105 
/skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 332, in forward_backward_pipelining_without_interleaving -[rank3]:E0603 16:09:36.557000 46105 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor = recv_forward( -[rank3]:E0603 16:09:36.557000 46105 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 99, in recv_forward -[rank3]:E0603 16:09:36.557000 46105 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_communication.recv_forward( -[rank3]:E0603 16:09:36.557000 46105 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 342, in recv_forward -[rank3]:E0603 16:09:36.557000 46105 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor, _ = _communicate( -[rank3]:E0603 16:09:36.557000 46105 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate -[rank3]:E0603 16:09:36.557000 46105 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm) -[rank3]:E0603 16:09:36.557000 46105 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops -[rank3]:E0603 16:09:36.557000 46105 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] reqs = torch.distributed.batch_isend_irecv(ops) -[rank3]:E0603 16:09:36.557000 46105 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv -[rank3]:E0603 16:09:36.557000 46105 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_op.op( -[rank3]:E0603 16:09:36.557000 46105 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2411, in irecv -[rank3]:E0603 16:09:36.557000 46105 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] return group.recv([tensor], group_src, tag) -[rank3]:E0603 16:09:36.557000 46105 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1 -[rank3]:E0603 16:09:36.557000 46105 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error. 
-[rank3]:E0603 16:09:36.557000 46105 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Last error: -[rank3]:E0603 16:09:36.557000 46105 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Error while creating shared memory segment /dev/shm/nccl-zATTI0 (size 8257920), error: No space left on device (28) -[rank3]:E0603 16:09:36.557000 46105 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] -[rank3]:E0603 16:09:36.557000 46105 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] To execute this test, run the following from the base repo dir: -[rank3]:E0603 16:09:36.557000 46105 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_inference_async_pipelining_without_interleaving -[rank3]:E0603 16:09:36.557000 46105 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] -[rank3]:E0603 16:09:36.557000 46105 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 -[rank3]:E0603 16:09:36.557000 46105 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 3 with exit code: 10 -[rank7]:E0603 16:09:36.557000 46109 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Caught exception: -[rank7]:E0603 16:09:36.557000 46109 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Traceback (most recent call last): -[rank7]:E0603 16:09:36.557000 46109 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test -[rank7]:E0603 16:09:36.557000 46109 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] getattr(self, test_name)() -[rank7]:E0603 16:09:36.557000 46109 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper -[rank7]:E0603 16:09:36.557000 46109 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] fn() -[rank7]:E0603 16:09:36.557000 46109 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper -[rank7]:E0603 16:09:36.557000 46109 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] method(*args, **kwargs) -[rank7]:E0603 16:09:36.557000 46109 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 254, in test_inference_async_pipelining_without_interleaving -[rank7]:E0603 16:09:36.557000 46109 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] self._forward_backward_test_impl( -[rank7]:E0603 16:09:36.557000 46109 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl -[rank7]:E0603 16:09:36.557000 46109 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] loss = fwd_bwd_func( -[rank7]:E0603 16:09:36.557000 46109 
/skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 367, in forward_backward_pipelining_without_interleaving -[rank7]:E0603 16:09:36.557000 46109 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor: List[Union[None, torch.Tensor, FutureTensor]] = recv_forward(tensor_shapes=recv_tensor_shapes, dtype=dtype, async_comm=async_comm) -[rank7]:E0603 16:09:36.557000 46109 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 99, in recv_forward -[rank7]:E0603 16:09:36.557000 46109 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_communication.recv_forward( -[rank7]:E0603 16:09:36.557000 46109 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 342, in recv_forward -[rank7]:E0603 16:09:36.557000 46109 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor, _ = _communicate( -[rank7]:E0603 16:09:36.557000 46109 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate -[rank7]:E0603 16:09:36.557000 46109 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm) -[rank7]:E0603 16:09:36.557000 46109 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops -[rank7]:E0603 16:09:36.557000 46109 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] reqs = torch.distributed.batch_isend_irecv(ops) -[rank7]:E0603 16:09:36.557000 46109 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv -[rank7]:E0603 16:09:36.557000 46109 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_op.op( -[rank7]:E0603 16:09:36.557000 46109 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2411, in irecv -[rank7]:E0603 16:09:36.557000 46109 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] return group.recv([tensor], group_src, tag) -[rank7]:E0603 16:09:36.557000 46109 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1 -[rank7]:E0603 16:09:36.557000 46109 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] ncclSystemError: System call (e.g. 
socket, malloc) or external library call failed or device error. -[rank7]:E0603 16:09:36.557000 46109 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Last error: -[rank7]:E0603 16:09:36.557000 46109 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Error while creating shared memory segment /dev/shm/nccl-XMZVej (size 8257920), error: No space left on device (28) -[rank7]:E0603 16:09:36.557000 46109 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] -[rank7]:E0603 16:09:36.557000 46109 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] To execute this test, run the following from the base repo dir: -[rank7]:E0603 16:09:36.557000 46109 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_inference_async_pipelining_without_interleaving -[rank7]:E0603 16:09:36.557000 46109 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] -[rank7]:E0603 16:09:36.557000 46109 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 -[rank7]:E0603 16:09:36.557000 46109 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 7 with exit code: 10 -[dist init] rank = 5, world_size = 8 -[dist init] rank = 7, world_size = 8 -[dist init] rank = 3, world_size = 8 -[rank1]:E0603 16:09:36.558000 46103 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Caught exception: -[rank1]:E0603 16:09:36.558000 46103 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Traceback (most recent call last): -[rank1]:E0603 16:09:36.558000 46103 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test -[rank1]:E0603 16:09:36.558000 46103 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] getattr(self, test_name)() -[rank1]:E0603 16:09:36.558000 46103 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper -[rank1]:E0603 16:09:36.558000 46103 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] fn() -[rank1]:E0603 16:09:36.558000 46103 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper -[rank1]:E0603 16:09:36.558000 46103 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] method(*args, **kwargs) -[rank1]:E0603 16:09:36.558000 46103 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 254, in test_inference_async_pipelining_without_interleaving -[rank1]:E0603 16:09:36.558000 46103 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] self._forward_backward_test_impl( -[rank1]:E0603 16:09:36.558000 46103 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl -[rank1]:E0603 16:09:36.558000 46103 
/skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] loss = fwd_bwd_func( -[rank1]:E0603 16:09:36.558000 46103 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 349, in forward_backward_pipelining_without_interleaving -[rank1]:E0603 16:09:36.558000 46103 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] send_forward( -[rank1]:E0603 16:09:36.558000 46103 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 145, in send_forward -[rank1]:E0603 16:09:36.558000 46103 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_communication.send_forward( -[rank1]:E0603 16:09:36.558000 46103 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 401, in send_forward -[rank1]:E0603 16:09:36.558000 46103 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] _communicate( -[rank1]:E0603 16:09:36.558000 46103 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate -[rank1]:E0603 16:09:36.558000 46103 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm) -[rank1]:E0603 16:09:36.558000 46103 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops -[rank1]:E0603 16:09:36.558000 46103 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] reqs = torch.distributed.batch_isend_irecv(ops) -[rank1]:E0603 16:09:36.558000 46103 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv -[rank1]:E0603 16:09:36.558000 46103 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_op.op( -[rank1]:E0603 16:09:36.558000 46103 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2366, in isend -[rank1]:E0603 16:09:36.558000 46103 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] return group.send([tensor], group_dst, tag) -[rank1]:E0603 16:09:36.558000 46103 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1 -[rank1]:E0603 16:09:36.558000 46103 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] ncclSystemError: System call (e.g. 
socket, malloc) or external library call failed or device error. -[rank1]:E0603 16:09:36.558000 46103 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Last error: -[rank1]:E0603 16:09:36.558000 46103 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Error while creating shared memory segment /dev/shm/nccl-G7NK1l (size 8257920), error: No space left on device (28) -[rank1]:E0603 16:09:36.558000 46103 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] -[rank1]:E0603 16:09:36.558000 46103 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] To execute this test, run the following from the base repo dir: -[rank1]:E0603 16:09:36.558000 46103 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_inference_async_pipelining_without_interleaving -[rank1]:E0603 16:09:36.558000 46103 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] -[rank1]:E0603 16:09:36.558000 46103 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 -[rank1]:E0603 16:09:36.558000 46103 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 1 with exit code: 10 -[dist init] rank = 1, world_size = 8 -[rank2]:[W603 16:09:36.882339790 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) -[rank4]:[W603 16:09:36.885695790 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) -[rank0]:[W603 16:09:36.995526595 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) -[rank6]:[W603 16:09:36.004199016 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) -[rank3]:[W603 16:09:37.466539945 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) -[rank1]:[W603 16:09:37.478031881 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) -[rank5]:[W603 16:09:37.493209837 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) -[rank7]:[W603 16:09:37.595542162 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. 
For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) -ERROR -test_inference_no_pipelining (test_pipeline_parallel_fwd_bwd.NcclPipelineParallelForwardBackwardTest) ... Process 1 terminated with exit code 10, terminating remaining processes. -[rank7]:[W603 16:09:58.973415270 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 7] using GPU 7 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank5]:[W603 16:09:58.982096174 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 5] using GPU 5 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank0]:[W603 16:09:58.986582472 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank1]:[W603 16:09:59.597541251 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank3]:[W603 16:09:59.614054759 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank4]:[W603 16:09:59.618555048 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 4] using GPU 4 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank6]:[W603 16:09:59.619095230 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 6] using GPU 6 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank2]:[W603 16:09:59.619297020 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/utils.py:81: UserWarning: This function is only for unittest - warnings.warn("This function is only for unittest") -[dist init] rank = 0, world_size = 8 - > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 1056 -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. 
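Both deprecations flagged in this run (GradScaler at test_pipeline_parallel_fwd_bwd.py:126 and autocast at schedules/common.py:295) name their torch.amp replacements in the warning text. A minimal, illustrative sketch of the migrated calls, assuming a PyTorch build that ships torch.amp.GradScaler and torch.amp.autocast; the flagged apex test and schedule code are not changed by this patch:

    import torch

    # was: torch.cuda.amp.GradScaler(init_scale=4.0)
    scaler = torch.amp.GradScaler("cuda", init_scale=4.0)

    # was: with torch.cuda.amp.autocast(...):
    with torch.amp.autocast("cuda", dtype=torch.float16):
        pass  # the pipeline stage's forward pass would run under autocast here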
 > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 1056
 > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 1056
/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:208: UserWarning: Converting a tensor with requires_grad=True to a scalar may lead to unexpected behavior.
Consider using tensor.detach() first. (Triggered internally at /skishore/github/pytorch/aten/src/ATen/native/Scalar.cpp:22.)
  self.assertEqual(x.item() / microbatch_size, target_loss.item())
(this UserWarning repeats on the other ranks)
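The UserWarning above comes from calling .item() on a tensor that still requires grad, and the warning itself suggests detaching first. A self-contained sketch of that pattern; the names x and microbatch_size here merely stand in for the test's loss tensor and microbatch size, and the test file is not modified by this patch:

    import torch

    microbatch_size = 2
    x = torch.ones((), requires_grad=True) * 2.0  # stand-in for the per-microbatch loss
    # was: x.item() / microbatch_size  (triggers the requires_grad warning)
    value = x.detach().item() / microbatch_size   # detach before converting to a Python scalar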
- self.assertEqual(x.item() / microbatch_size, target_loss.item()) -/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:208: UserWarning: Converting a tensor with requires_grad=True to a scalar may lead to unexpected behavior. -Consider using tensor.detach() first. (Triggered internally at /skishore/github/pytorch/aten/src/ATen/native/Scalar.cpp:22.) - self.assertEqual(x.item() / microbatch_size, target_loss.item()) -/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:208: UserWarning: Converting a tensor with requires_grad=True to a scalar may lead to unexpected behavior. -Consider using tensor.detach() first. (Triggered internally at /skishore/github/pytorch/aten/src/ATen/native/Scalar.cpp:22.) - self.assertEqual(x.item() / microbatch_size, target_loss.item()) - > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 1056 -/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:208: UserWarning: Converting a tensor with requires_grad=True to a scalar may lead to unexpected behavior. -Consider using tensor.detach() first. (Triggered internally at /skishore/github/pytorch/aten/src/ATen/native/Scalar.cpp:22.) - self.assertEqual(x.item() / microbatch_size, target_loss.item()) -/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:208: UserWarning: Converting a tensor with requires_grad=True to a scalar may lead to unexpected behavior. -Consider using tensor.detach() first. (Triggered internally at /skishore/github/pytorch/aten/src/ATen/native/Scalar.cpp:22.) - self.assertEqual(x.item() / microbatch_size, target_loss.item()) -/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. - torch.cuda.amp.GradScaler(init_scale=4.0) -/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. - torch.cuda.amp.GradScaler(init_scale=4.0) -/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. - torch.cuda.amp.GradScaler(init_scale=4.0) -/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. - torch.cuda.amp.GradScaler(init_scale=4.0) - > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 1056 -/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. - torch.cuda.amp.GradScaler(init_scale=4.0) -/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. - torch.cuda.amp.GradScaler(init_scale=4.0) -/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. 
- torch.cuda.amp.GradScaler(init_scale=4.0) -/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. - torch.cuda.amp.GradScaler(init_scale=4.0) - > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 1056 - > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 1056 - > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 1056 -[dist init] rank = 5, world_size = 8 -[dist init] rank = 7, world_size = 8 -[dist init] rank = 6, world_size = 8 -[dist init] rank = 4, world_size = 8 -[dist init] rank = 3, world_size = 8 -[dist init] rank = 1, world_size = 8 -[dist init] rank = 2, world_size = 8 -ok -test_inference_pipelining_with_interleaving (test_pipeline_parallel_fwd_bwd.NcclPipelineParallelForwardBackwardTest) ... [rank3]:[W603 16:11:13.919670522 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank0]:[W603 16:11:13.938326980 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank5]:[W603 16:11:13.939926885 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 5] using GPU 5 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank4]:[W603 16:11:13.941251481 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 4] using GPU 4 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank1]:[W603 16:11:13.942232367 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank7]:[W603 16:11:13.945283054 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 7] using GPU 7 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank6]:[W603 16:11:13.956809010 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 6] using GPU 6 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank2]:[W603 16:11:13.960771772 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. 
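Aside from the NCCL noise, the recurring FutureWarnings above are plain API deprecations: common.py:295 still calls torch.cuda.amp.autocast and the test still constructs torch.cuda.amp.GradScaler. A minimal sketch of the replacements the warnings themselves suggest, assuming a PyTorch build that already ships the device-aware torch.amp entry points (the dtype/enabled arguments below are illustrative placeholders, not taken from the real call sites):

    import torch

    # Device-aware form suggested by the FutureWarning for common.py:295.
    # The arguments apex actually passes are not shown in this log; these are placeholders.
    with torch.amp.autocast('cuda', dtype=torch.float16, enabled=torch.cuda.is_available()):
        pass  # the pipeline forward step would run here

    # Device-aware form suggested by the FutureWarning for test_pipeline_parallel_fwd_bwd.py:126.
    scaler = torch.amp.GradScaler('cuda', init_scale=4.0)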
-test_inference_pipelining_with_interleaving (test_pipeline_parallel_fwd_bwd.NcclPipelineParallelForwardBackwardTest) ... [rank3]:[W603 16:11:13.919670522 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device.
  [... the same ProcessGroupNCCL warning is printed once by each of the other seven ranks ...]
-/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/utils.py:81: UserWarning: This function is only for unittest
-  warnings.warn("This function is only for unittest")
-[dist init] rank = 0, world_size = 8
- > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 2112
  [... "[dist init]" lines for ranks 4, 2 and 6 follow, reporting 2112 parameters on pipeline ranks (0, 2), (0, 1) and (0, 3) respectively ...]
-/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
-  with torch.cuda.amp.autocast(
  [... the autocast FutureWarning is repeated once per rank, followed by many repeats of the parameter-count lines for pipeline ranks (0, 0) through (0, 3) ...]
-/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:208: UserWarning: Converting a tensor with requires_grad=True to a scalar may lead to unexpected behavior.
-Consider using tensor.detach() first. (Triggered internally at /skishore/github/pytorch/aten/src/ATen/native/Scalar.cpp:22.)
-  self.assertEqual(x.item() / microbatch_size, target_loss.item())
  [... the .item() UserWarning is printed by a second rank ...]
-/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead.
-  torch.cuda.amp.GradScaler(init_scale=4.0)
  [... the GradScaler FutureWarning is repeated once per rank, interleaved with many more parameter-count lines for pipeline ranks (0, 0) through (0, 3) ...]
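The requires_grad-to-scalar UserWarning comes from the assertion at test_pipeline_parallel_fwd_bwd.py:208, which calls .item() on a loss tensor that still requires grad. A small, self-contained illustration of the detach-first pattern the warning asks for (x and microbatch_size below are stand-ins for the test's locals, not code from this patch):

    import torch

    # Stand-in for the per-microbatch loss the test compares at line 208.
    x = torch.tensor(2.0, requires_grad=True)
    microbatch_size = 2

    # Detaching before .item() reads the value without touching autograd state,
    # which is what the UserWarning recommends.
    loss_per_sample = x.detach().item() / microbatch_size
    print(loss_per_sample)  # 1.0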
-[rank5]:E0603 16:12:26.862000 53769 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Caught exception:
  [... every line of this report carries the same "-[rank5]:E0603 16:12:26.862000 53769 .../common_distributed.py:741]" prefix, omitted below for readability ...]
  Traceback (most recent call last):
    File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test
      getattr(self, test_name)()
    File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper
      fn()
    File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper
      method(*args, **kwargs)
    File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 266, in test_inference_pipelining_with_interleaving
      self._forward_backward_test_impl(
    File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl
      loss = fwd_bwd_func(
    File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_with_interleaving.py", line 219, in _forward_backward_pipelining_with_interleaving
      p2p_communication.recv_forward(
    File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 342, in recv_forward
      input_tensor, _ = _communicate(
    File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate
      tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm)
    File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops
      reqs = torch.distributed.batch_isend_irecv(ops)
    File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv
      p2p_op.op(
    File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2411, in irecv
      return group.recv([tensor], group_src, tag)
  torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1
  ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error.
-[rank5]:E0603 16:12:26.862000 53769 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Last error:
  Error while creating shared memory segment /dev/shm/nccl-F0ZAWL (size 8257920), error: No space left on device (28)
  To execute this test, run the following from the base repo dir:
  PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_inference_pipelining_with_interleaving
  This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0
  exiting process 5 with exit code: 10
-[dist init] rank = 5, world_size = 8
  [... ranks 1, 7, 3, 6, 4, 0 and 2 then print essentially the same report: ranks 7, 3, 6, 4 and 2 fail in recv_forward (fwd_bwd_pipelining_with_interleaving.py line 219) -> _communicate -> _run_p2pops -> batch_isend_irecv -> irecv, while ranks 1 and 0 fail in send_forward_recv_forward (fwd_bwd_pipelining_with_interleaving.py line 276) -> _communicate -> _run_p2pops -> batch_isend_irecv -> isend; each traceback ends in the same torch.distributed.DistBackendError / ncclSystemError with "Error while creating shared memory segment /dev/shm/nccl-* (size 8257920), error: No space left on device (28)", and each process exits with code 10. "[dist init] rank = N, world_size = 8" lines for ranks 1, 7 and 3 are interleaved with these reports ...]
-[rank2]:E0603 16:12:26.877000 53766 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Caught exception:
  [... same recv_forward traceback, DistBackendError and ncclSystemError as rank 5 ...]
-[rank2]:E0603 16:12:26.877000 53766 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Last error: -[rank2]:E0603 16:12:26.877000 53766 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Error while creating shared memory segment /dev/shm/nccl-veLfEs (size 8257920), error: No space left on device (28) -[rank2]:E0603 16:12:26.877000 53766 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] -[rank2]:E0603 16:12:26.877000 53766 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] To execute this test, run the following from the base repo dir: -[rank2]:E0603 16:12:26.877000 53766 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_inference_pipelining_with_interleaving -[rank2]:E0603 16:12:26.877000 53766 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] -[rank2]:E0603 16:12:26.877000 53766 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 -[rank2]:E0603 16:12:26.877000 53766 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 2 with exit code: 10 -[rank6]:[W603 16:12:27.325486798 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) -[rank2]:[W603 16:12:27.341952636 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) -[rank4]:[W603 16:12:27.359584073 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) -[rank0]:[W603 16:12:27.428477181 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) -[rank7]:[W603 16:12:27.760319732 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) -[rank3]:[W603 16:12:27.767121477 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) -[rank1]:[W603 16:12:27.809854378 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) -[rank5]:[W603 16:12:27.932506932 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. 
-ERROR
-test_inference_pipelining_without_interleaving (test_pipeline_parallel_fwd_bwd.NcclPipelineParallelForwardBackwardTest) ... Process 7 terminated with exit code 10, terminating remaining processes.
-[rank0]:[W603 16:12:49.087699947 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device.
[ranks 5, 7, 1, 4, 3, 6, and 2 log the same "using GPU N as device used by this process is currently unknown" warning from ProcessGroupNCCL.cpp:4843]
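Every rank that fails in this run reports the same root cause: NCCL cannot create its shared-memory segment under /dev/shm ("No space left on device (28)"), so the pipeline-parallel send/recv setup aborts before the schedule runs. Before re-running the printed repro command, it can be worth confirming that the machine (or container) actually has enough tmpfs headroom for an 8-rank job. Below is a minimal, hypothetical sanity check (not part of apex or of this test suite); the threshold is only a rough multiple of the 8,257,920-byte segment size NCCL reports, and the helper name check_dev_shm is invented for illustration.

    # check_dev_shm.py -- rough sanity check before launching the 8-rank NCCL tests.
    import shutil

    SEGMENT_BYTES = 8_257_920   # segment size NCCL failed to allocate (from the log above)
    WORLD_SIZE = 8              # ranks used by NcclPipelineParallelForwardBackwardTest

    def check_dev_shm(min_free_bytes: int = WORLD_SIZE * SEGMENT_BYTES) -> None:
        """Raise if /dev/shm looks too small to hold one NCCL segment per rank."""
        usage = shutil.disk_usage("/dev/shm")   # named tuple: total, used, free (bytes)
        print(f"/dev/shm: total={usage.total:,} free={usage.free:,}")
        if usage.free < min_free_bytes:
            raise RuntimeError(
                f"/dev/shm has {usage.free:,} bytes free but roughly "
                f"{min_free_bytes:,} are needed; enlarge it before re-running the test."
            )

    if __name__ == "__main__":
        check_dev_shm()

If the tests run inside Docker, the usual remedy is to start the container with a larger --shm-size (or with --ipc=host) so that /dev/shm can hold NCCL's per-rank segments.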
-[dist init] rank = 2, world_size = 8
- > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 1056
-[dist init] rank = 4, world_size = 8
- > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 1056
-/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/utils.py:81: UserWarning: This function is only for unittest
- warnings.warn("This function is only for unittest")
-[dist init] rank = 6, world_size = 8
- > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 1056
-[dist init] rank = 0, world_size = 8
- > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 1056
-/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
- with torch.cuda.amp.autocast(
[the autocast FutureWarning repeats for each rank, interleaved with further " > number of parameters on (tensor, pipeline) model parallel rank (0, N): 1056" lines for N = 0..3 on every model build]
-/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:208: UserWarning: Converting a tensor with requires_grad=True to a scalar may lead to unexpected behavior.
-Consider using tensor.detach() first. (Triggered internally at /skishore/github/pytorch/aten/src/ATen/native/Scalar.cpp:22.)
- self.assertEqual(x.item() / microbatch_size, target_loss.item())
-/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead.
- torch.cuda.amp.GradScaler(init_scale=4.0)
[the GradScaler FutureWarning is likewise printed once per rank, followed by more " > number of parameters ..." lines]
-[rank4]:E0603 16:13:52.280000 57801 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Caught exception:
-[rank4]:E0603 16:13:52.280000 57801 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Traceback (most recent call last):
[the outer unittest frames match the rank 0 traceback above, ending at test_pipeline_parallel_fwd_bwd.py:244 in test_inference_pipelining_without_interleaving -> _forward_backward_test_impl -> loss = fwd_bwd_func(]
-[rank4]:E0603 16:13:52.280000 57801 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 332, in forward_backward_pipelining_without_interleaving
-[rank4]:E0603 16:13:52.280000 57801 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor = recv_forward(
-[rank4]:E0603 16:13:52.280000 57801 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 99, in recv_forward
-[rank4]:E0603 16:13:52.280000 57801 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_communication.recv_forward(
-[rank4]:E0603 16:13:52.280000 57801 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 342, in recv_forward
-[rank4]:E0603 16:13:52.280000 57801 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor, _ = _communicate(
[_communicate (p2p_communication.py:259) -> _run_p2pops (line 97) -> torch.distributed.batch_isend_irecv (distributed_c10d.py:2717) -> irecv (line 2411) -> group.recv, as in the traceback above]
-[rank4]:E0603 16:13:52.280000 57801 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1
-[rank4]:E0603 16:13:52.280000 57801 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error.
-[rank4]:E0603 16:13:52.280000 57801 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Last error:
-[rank4]:E0603 16:13:52.280000 57801 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Error while creating shared memory segment /dev/shm/nccl-WflEm3 (size 8257920), error: No space left on device (28)
-[rank4]:E0603 16:13:52.280000 57801 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] To execute this test, run the following from the base repo dir:
-[rank4]:E0603 16:13:52.280000 57801 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_inference_pipelining_without_interleaving
-[rank4]:E0603 16:13:52.280000 57801 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0
-[rank4]:E0603 16:13:52.280000 57801 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 4 with exit code: 10
[ranks 2 (pid 57799), 0 (57797), 6 (57803), 1 (57798), and 3 (57800) fail with the same DistBackendError/ncclSystemError while creating their /dev/shm/nccl-* segments (size 8257920, "No space left on device (28)") and exit with code 10: ranks 0 and 1 enter via send_forward (fwd_bwd_pipelining_without_interleaving.py:349 -> p2p_communication.send_forward -> isend -> group.send), rank 6 via recv_forward at line 367, and ranks 2 and 3 via recv_forward at line 332]
-[dist init] rank = 1, world_size = 8
[rank 7 (pid 57804) hits the same error; its traceback continues below]
-[rank7]:E0603 16:13:52.304000 57804
/skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 367, in forward_backward_pipelining_without_interleaving -[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor: List[Union[None, torch.Tensor, FutureTensor]] = recv_forward(tensor_shapes=recv_tensor_shapes, dtype=dtype, async_comm=async_comm) -[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 99, in recv_forward -[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_communication.recv_forward( -[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 342, in recv_forward -[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor, _ = _communicate( -[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate -[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm) -[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops -[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] reqs = torch.distributed.batch_isend_irecv(ops) -[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv -[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_op.op( -[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2411, in irecv -[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] return group.recv([tensor], group_src, tag) -[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1 -[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] ncclSystemError: System call (e.g. 
socket, malloc) or external library call failed or device error. -[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Last error: -[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Error while creating shared memory segment /dev/shm/nccl-SMCF6k (size 8257920), error: No space left on device (28) -[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] -[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] To execute this test, run the following from the base repo dir: -[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_inference_pipelining_without_interleaving -[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] -[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 -[rank7]:E0603 16:13:52.304000 57804 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 7 with exit code: 10 -[dist init] rank = 3, world_size = 8 -[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Caught exception: -[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Traceback (most recent call last): -[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test -[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] getattr(self, test_name)() -[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper -[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] fn() -[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper -[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] method(*args, **kwargs) -[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 244, in test_inference_pipelining_without_interleaving -[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] self._forward_backward_test_impl( -[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl -[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] loss = 
fwd_bwd_func( -[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 332, in forward_backward_pipelining_without_interleaving -[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor = recv_forward( -[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 99, in recv_forward -[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_communication.recv_forward( -[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 342, in recv_forward -[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor, _ = _communicate( -[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate -[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm) -[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops -[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] reqs = torch.distributed.batch_isend_irecv(ops) -[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv -[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_op.op( -[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2411, in irecv -[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] return group.recv([tensor], group_src, tag) -[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1 -[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error. 
-[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Last error: -[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Error while creating shared memory segment /dev/shm/nccl-eVwAPm (size 8257920), error: No space left on device (28) -[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] -[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] To execute this test, run the following from the base repo dir: -[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_inference_pipelining_without_interleaving -[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] -[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 -[rank5]:E0603 16:13:52.304000 57802 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 5 with exit code: 10 -[dist init] rank = 7, world_size = 8 -[dist init] rank = 5, world_size = 8 -[rank2]:[W603 16:13:52.645901574 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) -[rank4]:[W603 16:13:52.659257967 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) -[rank0]:[W603 16:13:52.771542944 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) -[rank6]:[W603 16:13:52.781064435 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) -[rank1]:[W603 16:13:53.269871734 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) -[rank3]:[W603 16:13:53.278681750 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) -[rank7]:[W603 16:13:53.365874516 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) -[rank5]:[W603 16:13:53.374848385 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. 
For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) -ERROR -test_inference_pipelining_without_interleaving_ucc_for_p2p (test_pipeline_parallel_fwd_bwd.NcclPipelineParallelForwardBackwardTest) ... Process 1 terminated with exit code 10, terminating remaining processes. -[rank0]:[W603 16:14:15.213860590 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank6]:[W603 16:14:15.229402887 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 6] using GPU 6 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank4]:[W603 16:14:15.237331454 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 4] using GPU 4 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank2]:[W603 16:14:15.240780322 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank7]:[W603 16:14:15.246388306 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 7] using GPU 7 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank1]:[W603 16:14:15.251613170 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank5]:[W603 16:14:15.262122898 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 5] using GPU 5 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank3]:[W603 16:14:15.264435740 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[dist init] rank = 0, world_size = 8 -[dist init] rank = 3, world_size = 8 -[dist init] rank = 4, world_size = 8 -[dist init] rank = 5, world_size = 8 -[dist init] rank = 2, world_size = 8 -[dist init] rank = 7, world_size = 8 -[dist init] rank = 1, world_size = 8 -[dist init] rank = 6, world_size = 8 -[rank0]:[W603 16:14:24.075517383 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. 
For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) -skipped 'Test skipped at subprocess level, look at subprocess log for skip reason' -test_learning_async_pipelining_with_interleaving (test_pipeline_parallel_fwd_bwd.NcclPipelineParallelForwardBackwardTest) ... [rank0]:[W603 16:14:36.025146016 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank3]:[W603 16:14:37.810567302 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank2]:[W603 16:14:37.830866809 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank1]:[W603 16:14:37.831099595 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank4]:[W603 16:14:37.833581789 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 4] using GPU 4 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank7]:[W603 16:14:37.850227383 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 7] using GPU 7 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank6]:[W603 16:14:37.852124078 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 6] using GPU 6 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank5]:[W603 16:14:37.852242875 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 5] using GPU 5 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. 
-[dist init] rank = 4, world_size = 8 - > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 2112 -[dist init] rank = 2, world_size = 8 - > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 2112 -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/utils.py:81: UserWarning: This function is only for unittest - warnings.warn("This function is only for unittest") -[dist init] rank = 6, world_size = 8 - > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 2112 -[dist init] rank = 0, world_size = 8 - > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 2112 -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. 
- with torch.cuda.amp.autocast( - > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 2112 - > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 2112 - > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 2112 - > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 2112 - > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 2112 - > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 2112 - > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 2112 - > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 2112 -/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:208: UserWarning: Converting a tensor with requires_grad=True to a scalar may lead to unexpected behavior. -Consider using tensor.detach() first. (Triggered internally at /skishore/github/pytorch/aten/src/ATen/native/Scalar.cpp:22.) - self.assertEqual(x.item() / microbatch_size, target_loss.item()) - > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 2112 -/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:208: UserWarning: Converting a tensor with requires_grad=True to a scalar may lead to unexpected behavior. -Consider using tensor.detach() first. (Triggered internally at /skishore/github/pytorch/aten/src/ATen/native/Scalar.cpp:22.) - self.assertEqual(x.item() / microbatch_size, target_loss.item()) - > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 2112 - > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 2112 - > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 2112 -/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. - torch.cuda.amp.GradScaler(init_scale=4.0) - > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 2112 -/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. - torch.cuda.amp.GradScaler(init_scale=4.0) -/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. - torch.cuda.amp.GradScaler(init_scale=4.0) -/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. - torch.cuda.amp.GradScaler(init_scale=4.0) - > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 2112 -/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. - torch.cuda.amp.GradScaler(init_scale=4.0) -/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. 
- torch.cuda.amp.GradScaler(init_scale=4.0) -/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. - torch.cuda.amp.GradScaler(init_scale=4.0) - > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 2112 -/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. - torch.cuda.amp.GradScaler(init_scale=4.0) - > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 2112 - > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 2112 - > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 2112 - > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 2112 - > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 2112 - > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 2112 - > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 2112 - > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 2112 - > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 2112 - > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 2112 - > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 2112 - > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 2112 - > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 2112 -[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Caught exception: -[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Traceback (most recent call last): -[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test -[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] getattr(self, test_name)() -[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper -[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] fn() -[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper -[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] method(*args, **kwargs) -[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 272, in test_learning_async_pipelining_with_interleaving -[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] self._forward_backward_test_impl( -[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File 
"/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl -[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] loss = fwd_bwd_func( -[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_with_interleaving.py", line 276, in _forward_backward_pipelining_with_interleaving -[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor = p2p_communication.send_forward_recv_forward( -[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 515, in send_forward_recv_forward -[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor, _ = _communicate( -[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate -[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm) -[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops -[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] reqs = torch.distributed.batch_isend_irecv(ops) -[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv -[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_op.op( -[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2366, in isend -[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] return group.send([tensor], group_dst, tag) -[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1 -[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error. 
-[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Last error: -[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Error while creating shared memory segment /dev/shm/nccl-BS3aqb (size 8257920), error: No space left on device (28) -[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] -[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] To execute this test, run the following from the base repo dir: -[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_learning_async_pipelining_with_interleaving -[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] -[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 -[rank0]:E0603 16:16:56.226000 62443 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 0 with exit code: 10 -[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Caught exception: -[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Traceback (most recent call last): -[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test -[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] getattr(self, test_name)() -[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper -[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] fn() -[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper -[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] method(*args, **kwargs) -[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 272, in test_learning_async_pipelining_with_interleaving -[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] self._forward_backward_test_impl( -[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl -[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] loss = fwd_bwd_func( -[rank2]:E0603 16:16:56.226000 62445 
/skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_with_interleaving.py", line 219, in _forward_backward_pipelining_with_interleaving -[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_communication.recv_forward( -[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 342, in recv_forward -[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor, _ = _communicate( -[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate -[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm) -[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops -[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] reqs = torch.distributed.batch_isend_irecv(ops) -[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv -[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_op.op( -[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2411, in irecv -[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] return group.recv([tensor], group_src, tag) -[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1 -[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error. 
-[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Last error: -[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Error while creating shared memory segment /dev/shm/nccl-vuRJwT (size 8257920), error: No space left on device (28) -[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] -[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] To execute this test, run the following from the base repo dir: -[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_learning_async_pipelining_with_interleaving -[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] -[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 -[rank2]:E0603 16:16:56.226000 62445 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 2 with exit code: 10 -[rank4]:E0603 16:16:56.226000 62447 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Caught exception: -[rank4]:E0603 16:16:56.226000 62447 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Traceback (most recent call last): -[rank4]:E0603 16:16:56.226000 62447 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test -[rank4]:E0603 16:16:56.226000 62447 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] getattr(self, test_name)() -[rank4]:E0603 16:16:56.226000 62447 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper -[rank4]:E0603 16:16:56.226000 62447 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] fn() -[rank4]:E0603 16:16:56.226000 62447 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper -[rank4]:E0603 16:16:56.226000 62447 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] method(*args, **kwargs) -[rank4]:E0603 16:16:56.226000 62447 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 272, in test_learning_async_pipelining_with_interleaving -[rank4]:E0603 16:16:56.226000 62447 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] self._forward_backward_test_impl( -[rank4]:E0603 16:16:56.226000 62447 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl -[rank4]:E0603 16:16:56.226000 62447 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] loss = fwd_bwd_func( -[rank4]:E0603 16:16:56.226000 62447 
/skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_with_interleaving.py", line 219, in _forward_backward_pipelining_with_interleaving
-[rank4]:E0603 16:16:56.226000 62447 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_communication.recv_forward(
-[rank4]:E0603 16:16:56.226000 62447 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 342, in recv_forward
-[rank4]:E0603 16:16:56.226000 62447 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor, _ = _communicate(
-[rank4]:E0603 16:16:56.226000 62447 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate
-[rank4]:E0603 16:16:56.226000 62447 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm)
-[rank4]:E0603 16:16:56.226000 62447 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops
-[rank4]:E0603 16:16:56.226000 62447 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] reqs = torch.distributed.batch_isend_irecv(ops)
-[rank4]:E0603 16:16:56.226000 62447 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv
-[rank4]:E0603 16:16:56.226000 62447 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_op.op(
-[rank4]:E0603 16:16:56.226000 62447 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2411, in irecv
-[rank4]:E0603 16:16:56.226000 62447 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] return group.recv([tensor], group_src, tag)
-[rank4]:E0603 16:16:56.226000 62447 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1
-[rank4]:E0603 16:16:56.226000 62447 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error.
-[rank4]:E0603 16:16:56.226000 62447 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Last error:
-[rank4]:E0603 16:16:56.226000 62447 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Error while creating shared memory segment /dev/shm/nccl-nVYHSK (size 8257920), error: No space left on device (28)
-[rank4]:E0603 16:16:56.226000 62447 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741]
-[rank4]:E0603 16:16:56.226000 62447 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] To execute this test, run the following from the base repo dir:
-[rank4]:E0603 16:16:56.226000 62447 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_learning_async_pipelining_with_interleaving
-[rank4]:E0603 16:16:56.226000 62447 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741]
-[rank4]:E0603 16:16:56.226000 62447 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0
-[rank4]:E0603 16:16:56.226000 62447 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 4 with exit code: 10
[Ranks 6, 5, 7, and 3 (pids 62449, 62448, 62450, 62446) raised the same torch.distributed.DistBackendError/ncclSystemError at the same recv_forward call while creating their own NCCL shared memory segments in /dev/shm ("No space left on device (28)") and exited with code 10; rank 1 (pid 62444) hit the same error via send_forward_recv_forward (fwd_bwd_pipelining_with_interleaving.py line 276 -> p2p_communication.py line 515) and isend/group.send (distributed_c10d.py line 2366). Their otherwise identical tracebacks and repro commands are omitted here.]
-[dist init] rank = 5, world_size = 8
-[dist init] rank = 7, world_size = 8
-[dist init] rank = 1, world_size = 8
-[dist init] rank = 3, world_size = 8
[Each of the eight ranks then logged the ProcessGroupNCCL.cpp:1553 warning that destroy_process_group() was not called before program exit, which can leak resources; see https://pytorch.org/docs/stable/distributed.html#shutdown.]
-ERROR
-test_learning_async_pipelining_without_interleaving (test_pipeline_parallel_fwd_bwd.NcclPipelineParallelForwardBackwardTest) ... Process 6 terminated with exit code 10, terminating remaining processes.
[Each rank 0-7 then logged the ProcessGroupNCCL.cpp:4843 warning that it is using GPU N (N = rank) while the device used by the process is currently unknown, which can cause a hang if the rank-to-GPU mapping is incorrect, and that device_id can be passed to init_process_group() to force use of a particular device.]
-/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/utils.py:81: UserWarning: This function is only for unittest
- warnings.warn("This function is only for unittest")
-[dist init] rank = 6, world_size = 8
- > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 1056
-[dist init] rank = 0, world_size = 8
- > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 1056
-[dist init] rank = 4, world_size = 8
- > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 1056
-[dist init] rank = 2, world_size = 8
- > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 1056
[The FutureWarning "torch.cuda.amp.autocast(args...) is deprecated. Please use torch.amp.autocast('cuda', args...) instead." (common.py:295, at "with torch.cuda.amp.autocast(") was emitted eight times, followed by further "> number of parameters on (tensor, pipeline) model parallel rank (0, 0-3): 1056" lines.]
-/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:208: UserWarning: Converting a tensor with requires_grad=True to a scalar may lead to unexpected behavior.
-Consider using tensor.detach() first. (Triggered internally at /skishore/github/pytorch/aten/src/ATen/native/Scalar.cpp:22.)
- self.assertEqual(x.item() / microbatch_size, target_loss.item())
[The warning above appeared twice. The FutureWarning "torch.cuda.amp.GradScaler(args...) is deprecated. Please use torch.amp.GradScaler('cuda', args...) instead." (test_pipeline_parallel_fwd_bwd.py:126, at "torch.cuda.amp.GradScaler(init_scale=4.0)") was emitted eight times, interleaved with further per-rank "> number of parameters ... : 1056" lines.]
-[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Caught exception:
-[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Traceback (most recent call last):
-[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test
-[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] getattr(self, test_name)()
-[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper
-[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] fn()
-[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper
-[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] method(*args, **kwargs)
-[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 249, in test_learning_async_pipelining_without_interleaving
-[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] self._forward_backward_test_impl(
-[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl
-[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] loss = fwd_bwd_func(
-[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 332, in forward_backward_pipelining_without_interleaving
-[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor = recv_forward(
-[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 99, in recv_forward
-[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_communication.recv_forward(
-[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 342, in recv_forward
-[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor, _ = _communicate(
-[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate
-[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm)
-[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops
-[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] reqs = torch.distributed.batch_isend_irecv(ops)
-[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv
-[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_op.op(
-[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2411, in irecv
-[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] return group.recv([tensor], group_src, tag)
-[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1
-[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error.
-[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Last error:
-[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Error while creating shared memory segment /dev/shm/nccl-gwbLVE (size 8257920), error: No space left on device (28)
-[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741]
-[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] To execute this test, run the following from the base repo dir:
-[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_learning_async_pipelining_without_interleaving
-[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741]
-[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0
-[rank2]:E0603 16:19:14.430000 66590 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 2 with exit code: 10
[Ranks 4 and 5 (pids 66592, 66593) failed at the same recv_forward call with the same ncclSystemError ("No space left on device (28)" while creating /dev/shm/nccl-gP7wMi and /dev/shm/nccl-EwoJJ3) and exited with code 10; their otherwise identical tracebacks and repro commands are omitted here.]
-[dist init] rank = 5, world_size = 8
-[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Caught exception:
-[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Traceback (most recent call last):
-[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test
-[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] getattr(self, test_name)()
-[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper
-[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] fn()
-[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper
-[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] method(*args, **kwargs)
-[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 249, in test_learning_async_pipelining_without_interleaving
-[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] self._forward_backward_test_impl(
-[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl
-[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] loss = fwd_bwd_func(
-[rank7]:E0603 16:19:14.431000 66595
/skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 367, in forward_backward_pipelining_without_interleaving -[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor: List[Union[None, torch.Tensor, FutureTensor]] = recv_forward(tensor_shapes=recv_tensor_shapes, dtype=dtype, async_comm=async_comm) -[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 99, in recv_forward -[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_communication.recv_forward( -[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 342, in recv_forward -[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor, _ = _communicate( -[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate -[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm) -[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops -[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] reqs = torch.distributed.batch_isend_irecv(ops) -[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv -[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_op.op( -[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2411, in irecv -[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] return group.recv([tensor], group_src, tag) -[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1 -[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] ncclSystemError: System call (e.g. 
socket, malloc) or external library call failed or device error. -[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Last error: -[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Error while creating shared memory segment /dev/shm/nccl-bTPujE (size 8257920), error: No space left on device (28) -[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] -[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] To execute this test, run the following from the base repo dir: -[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_learning_async_pipelining_without_interleaving -[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] -[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 -[rank7]:E0603 16:19:14.431000 66595 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 7 with exit code: 10 -[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Caught exception: -[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Traceback (most recent call last): -[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test -[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] getattr(self, test_name)() -[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper -[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] fn() -[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper -[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] method(*args, **kwargs) -[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 249, in test_learning_async_pipelining_without_interleaving -[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] self._forward_backward_test_impl( -[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl -[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] loss = fwd_bwd_func( -[rank1]:E0603 
16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 349, in forward_backward_pipelining_without_interleaving -[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] send_forward( -[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 145, in send_forward -[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_communication.send_forward( -[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 401, in send_forward -[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] _communicate( -[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate -[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm) -[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops -[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] reqs = torch.distributed.batch_isend_irecv(ops) -[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv -[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_op.op( -[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2366, in isend -[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] return group.send([tensor], group_dst, tag) -[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1 -[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error. 
-[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Last error: -[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Error while creating shared memory segment /dev/shm/nccl-OSTKDz (size 8257920), error: No space left on device (28) -[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] -[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] To execute this test, run the following from the base repo dir: -[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_learning_async_pipelining_without_interleaving -[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] -[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 -[rank1]:E0603 16:19:14.431000 66589 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 1 with exit code: 10 -[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Caught exception: -[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Traceback (most recent call last): -[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test -[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] getattr(self, test_name)() -[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper -[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] fn() -[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper -[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] method(*args, **kwargs) -[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 249, in test_learning_async_pipelining_without_interleaving -[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] self._forward_backward_test_impl( -[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl -[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] loss = fwd_bwd_func( -[rank3]:E0603 16:19:14.432000 66591 
/skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 332, in forward_backward_pipelining_without_interleaving -[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor = recv_forward( -[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 99, in recv_forward -[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_communication.recv_forward( -[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 342, in recv_forward -[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor, _ = _communicate( -[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate -[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm) -[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops -[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] reqs = torch.distributed.batch_isend_irecv(ops) -[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv -[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_op.op( -[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2411, in irecv -[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] return group.recv([tensor], group_src, tag) -[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1 -[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error. 
-[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Last error: -[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Error while creating shared memory segment /dev/shm/nccl-IVeBCr (size 8257920), error: No space left on device (28) -[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] -[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] To execute this test, run the following from the base repo dir: -[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_learning_async_pipelining_without_interleaving -[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] -[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 -[rank3]:E0603 16:19:14.432000 66591 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 3 with exit code: 10 -[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Caught exception: -[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Traceback (most recent call last): -[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test -[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] getattr(self, test_name)() -[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper -[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] fn() -[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper -[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] method(*args, **kwargs) -[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 249, in test_learning_async_pipelining_without_interleaving -[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] self._forward_backward_test_impl( -[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl -[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] loss = fwd_bwd_func( -[rank6]:E0603 16:19:14.432000 66594 
/skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 367, in forward_backward_pipelining_without_interleaving -[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor: List[Union[None, torch.Tensor, FutureTensor]] = recv_forward(tensor_shapes=recv_tensor_shapes, dtype=dtype, async_comm=async_comm) -[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 99, in recv_forward -[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_communication.recv_forward( -[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 342, in recv_forward -[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor, _ = _communicate( -[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate -[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm) -[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops -[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] reqs = torch.distributed.batch_isend_irecv(ops) -[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv -[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_op.op( -[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2411, in irecv -[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] return group.recv([tensor], group_src, tag) -[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1 -[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] ncclSystemError: System call (e.g. 
socket, malloc) or external library call failed or device error. -[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Last error: -[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Error while creating shared memory segment /dev/shm/nccl-DLYrhm (size 8257920), error: No space left on device (28) -[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] -[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] To execute this test, run the following from the base repo dir: -[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_learning_async_pipelining_without_interleaving -[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] -[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 -[rank6]:E0603 16:19:14.432000 66594 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 6 with exit code: 10 -[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Caught exception: -[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Traceback (most recent call last): -[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test -[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] getattr(self, test_name)() -[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper -[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] fn() -[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper -[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] method(*args, **kwargs) -[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 249, in test_learning_async_pipelining_without_interleaving -[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] self._forward_backward_test_impl( -[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl -[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] loss = fwd_bwd_func( -[rank0]:E0603 
16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 349, in forward_backward_pipelining_without_interleaving -[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] send_forward( -[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 145, in send_forward -[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_communication.send_forward( -[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 401, in send_forward -[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] _communicate( -[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate -[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm) -[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops -[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] reqs = torch.distributed.batch_isend_irecv(ops) -[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv -[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_op.op( -[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2366, in isend -[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] return group.send([tensor], group_dst, tag) -[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1 -[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error. 
-[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Last error: -[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Error while creating shared memory segment /dev/shm/nccl-5FNsgR (size 8257920), error: No space left on device (28) -[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] -[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] To execute this test, run the following from the base repo dir: -[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_learning_async_pipelining_without_interleaving -[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] -[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 -[rank0]:E0603 16:19:14.432000 66588 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 0 with exit code: 10 -[dist init] rank = 1, world_size = 8 -[dist init] rank = 7, world_size = 8 -[dist init] rank = 3, world_size = 8 -[rank6]:[W603 16:19:14.744460635 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) -[rank2]:[W603 16:19:14.746772096 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) -[rank4]:[W603 16:19:14.760300394 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) -[rank0]:[W603 16:19:14.772900364 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) -[rank1]:[W603 16:19:15.356409455 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) -[rank3]:[W603 16:19:15.363179882 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) -[rank7]:[W603 16:19:15.494206214 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) -[rank5]:[W603 16:19:15.494956748 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. 
-ERROR
-test_learning_no_pipelining (test_pipeline_parallel_fwd_bwd.NcclPipelineParallelForwardBackwardTest) ... Process 1 terminated with exit code 10, terminating remaining processes.
-[all eight ranks warn (ProcessGroupNCCL.cpp:4843) that the GPU used by this process is currently unknown, which can cause a hang if the rank-to-GPU mapping is incorrect; device_id can be passed to init_process_group() to force a particular device]
-/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/utils.py:81: UserWarning: This function is only for unittest
-  warnings.warn("This function is only for unittest")
-[dist init] rank = 0, world_size = 8
- > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 1056
-/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
-  with torch.cuda.amp.autocast(
-[the same `torch.cuda.amp.autocast` FutureWarning is emitted by each of the eight ranks]
- > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 1056
-/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:208: UserWarning: Converting a tensor with requires_grad=True to a scalar may lead to unexpected behavior.
-Consider using tensor.detach() first. (Triggered internally at /skishore/github/pytorch/aten/src/ATen/native/Scalar.cpp:22.)
-  self.assertEqual(x.item() / microbatch_size, target_loss.item())
-[this UserWarning is emitted by each of the eight ranks]
-/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead.
-  torch.cuda.amp.GradScaler(init_scale=4.0)
-[this FutureWarning is emitted by each of the eight ranks; every rank also reports "number of parameters on (tensor, pipeline) model parallel rank (0, 0): 1056"]
-[dist init] rank = 7, world_size = 8
-[dist init] rank = 6, world_size = 8
-[dist init] rank = 2, world_size = 8
-[dist init] rank = 5, world_size = 8
-[dist init] rank = 4, world_size = 8
-[dist init] rank = 1, world_size = 8
-[dist init] rank = 3, world_size = 8
-ok
-test_learning_pipelining_with_interleaving (test_pipeline_parallel_fwd_bwd.NcclPipelineParallelForwardBackwardTest) ...
-[all eight ranks again warn (ProcessGroupNCCL.cpp:4843) that the GPU used by this process is currently unknown; device_id can be passed to init_process_group() to force a particular device]
-[dist init] rank = 4, world_size = 8 - > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 2112 -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/utils.py:81: UserWarning: This function is only for unittest - warnings.warn("This function is only for unittest") -[dist init] rank = 2, world_size = 8 - > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 2112 -[dist init] rank = 6, world_size = 8 - > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 2112 -[dist init] rank = 0, world_size = 8 - > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 2112 -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. 
- with torch.cuda.amp.autocast( - > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 2112 - > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 2112 - > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 2112 - > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 2112 - > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 2112 - > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 2112 - > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 2112 - > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 2112 - > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 2112 -/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:208: UserWarning: Converting a tensor with requires_grad=True to a scalar may lead to unexpected behavior. -Consider using tensor.detach() first. (Triggered internally at /skishore/github/pytorch/aten/src/ATen/native/Scalar.cpp:22.) - self.assertEqual(x.item() / microbatch_size, target_loss.item()) - > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 2112 -/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:208: UserWarning: Converting a tensor with requires_grad=True to a scalar may lead to unexpected behavior. -Consider using tensor.detach() first. (Triggered internally at /skishore/github/pytorch/aten/src/ATen/native/Scalar.cpp:22.) - self.assertEqual(x.item() / microbatch_size, target_loss.item()) - > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 2112 - > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 2112 -/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. - torch.cuda.amp.GradScaler(init_scale=4.0) -/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. - torch.cuda.amp.GradScaler(init_scale=4.0) - > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 2112 -/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. - torch.cuda.amp.GradScaler(init_scale=4.0) -/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. - torch.cuda.amp.GradScaler(init_scale=4.0) - > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 2112 -/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. - torch.cuda.amp.GradScaler(init_scale=4.0) - > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 2112 -/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. 
- torch.cuda.amp.GradScaler(init_scale=4.0)
- > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 2112
  [... further repeated GradScaler FutureWarnings and parameter-count lines for pipeline ranks 0-3 omitted ...]
-[rank7]:E0603 16:23:37.606000 74341 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Caught exception:
  [every line of the traceback below carries the same "[rank7]:E0603 16:23:37.606000 74341 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741]" prefix]
- Traceback (most recent call last):
-   File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test
-     getattr(self, test_name)()
-   File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper
-     fn()
-   File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper
-     method(*args, **kwargs)
-   File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 260, in test_learning_pipelining_with_interleaving
-     self._forward_backward_test_impl(
-   File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl
-     loss = fwd_bwd_func(
-   File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_with_interleaving.py", line 219, in _forward_backward_pipelining_with_interleaving
-     p2p_communication.recv_forward(
-   File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 342, in recv_forward
-     input_tensor, _ = _communicate(
-   File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate
-     tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm)
-   File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops
-     reqs = torch.distributed.batch_isend_irecv(ops)
-   File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv
-     p2p_op.op(
-   File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2411, in irecv
-     return group.recv([tensor], group_src, tag)
- torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1
- ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error.
- Last error:
- Error while creating shared memory segment /dev/shm/nccl-g8m58H (size 8257920), error: No space left on device (28)
- To execute this test, run the following from the base repo dir:
- PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_learning_pipelining_with_interleaving
- This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0
- exiting process 7 with exit code: 10
-[dist init] rank = 7, world_size = 8
  [... ranks 0-6 hit the same torch.distributed.DistBackendError: ranks 2, 3, 4, 5 and 6 fail exactly like rank 7 (recv_forward -> irecv), while ranks 0 and 1 fail in send_forward_recv_forward (fwd_bwd_pipelining_with_interleaving.py line 276, p2p_communication.py line 515) -> isend (distributed_c10d.py line 2366, group.send); every rank reports "Error while creating shared memory segment /dev/shm/nccl-* (size 8257920), error: No space left on device (28)" and exits with code 10; [dist init] lines for ranks 1, 3 and 5 are interleaved with these tracebacks ...]
-[rank2]:[W603 16:23:37.958873085 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
  [... the same destroy_process_group() warning is emitted by ranks 0, 4, 6, 3, 5, 1 and 7 ...]
-ERROR
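Note (not part of the original test log): the repeated ncclSystemError above points at the environment rather than the apex code under test; every rank fails to create an ~8 MB NCCL shared-memory segment because /dev/shm is full. The sketch below is a minimal, hypothetical pre-flight check one could run before relaunching the 8-rank test; the helper name check_dev_shm and the 1 GiB free-space threshold are illustrative assumptions, not values taken from the test suite or from NCCL documentation.

import shutil

# Hypothetical pre-flight check: verify /dev/shm has headroom before an
# 8-rank NCCL run. The 1 GiB threshold is an illustrative guess, not a
# documented NCCL requirement.
def check_dev_shm(min_free_bytes: int = 1 << 30) -> None:
    usage = shutil.disk_usage("/dev/shm")
    print(f"/dev/shm: total={usage.total} used={usage.used} free={usage.free}")
    if usage.free < min_free_bytes:
        raise RuntimeError(
            "/dev/shm is nearly full; NCCL shared-memory transport is likely to fail "
            "with 'No space left on device (28)'. Enlarge the tmpfs (e.g. "
            "docker run --shm-size=...) or remove stale /dev/shm/nccl-* segments "
            "left behind by crashed ranks."
        )

if __name__ == "__main__":
    check_dev_shm()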
For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) -ERROR -test_learning_pipelining_without_interleaving (test_pipeline_parallel_fwd_bwd.NcclPipelineParallelForwardBackwardTest) ... Process 3 terminated with exit code 10, terminating remaining processes. -[rank0]:[W603 16:24:01.300922058 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank3]:[W603 16:24:01.310179142 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank4]:[W603 16:24:01.313455880 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 4] using GPU 4 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank1]:[W603 16:24:02.201775634 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank6]:[W603 16:24:02.208507283 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 6] using GPU 6 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank2]:[W603 16:24:02.208962478 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank7]:[W603 16:24:02.209742284 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 7] using GPU 7 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank5]:[W603 16:24:02.219031365 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 5] using GPU 5 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. 
-/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/utils.py:81: UserWarning: This function is only for unittest
- warnings.warn("This function is only for unittest")
-[dist init] rank = 4, world_size = 8
- > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 1056
-[dist init] rank = 0, world_size = 8
- > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 1056
-[dist init] rank = 2, world_size = 8
- > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 1056
-[dist init] rank = 6, world_size = 8
- > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 1056
-/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
- with torch.cuda.amp.autocast(
-/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:208: UserWarning: Converting a tensor with requires_grad=True to a scalar may lead to unexpected behavior.
-Consider using tensor.detach() first. (Triggered internally at /skishore/github/pytorch/aten/src/ATen/native/Scalar.cpp:22.)
- self.assertEqual(x.item() / microbatch_size, target_loss.item())
-/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py:126: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead.
- torch.cuda.amp.GradScaler(init_scale=4.0)
-[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Caught exception:
-[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Traceback (most recent call last):
-[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test
-[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] getattr(self, test_name)()
-[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper
-[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] fn()
-[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper
-[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] method(*args, **kwargs)
-[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 239, in test_learning_pipelining_without_interleaving
-[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] self._forward_backward_test_impl(
-[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl
-[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] loss = fwd_bwd_func(
-[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 332, in forward_backward_pipelining_without_interleaving
-[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor = recv_forward(
-[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 99, in recv_forward
-[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_communication.recv_forward(
-[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 342, in recv_forward
-[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] input_tensor, _ = _communicate(
-[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate
-[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm)
-[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops
-[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] reqs = torch.distributed.batch_isend_irecv(ops)
-[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv
-[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_op.op(
-[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2411, in irecv
-[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] return group.recv([tensor], group_src, tag)
-[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1
-[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error.
-[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Last error:
-[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Error while creating shared memory segment /dev/shm/nccl-2fZGs1 (size 8257920), error: No space left on device (28)
-[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741]
-[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] To execute this test, run the following from the base repo dir:
-[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_learning_pipelining_without_interleaving
-[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741]
-[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0
-[rank5]:E0603 16:26:09.377000 78484 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 5 with exit code: 10
-[dist init] rank = 5, world_size = 8
-[rank3]:E0603 16:26:09.378000 78482 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 3 with exit code: 10
-[rank1]:E0603 16:26:09.378000 78480 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 1 with exit code: 10
-[rank7]:E0603 16:26:09.378000 78486 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 7 with exit code: 10
-[dist init] rank = 1, world_size = 8
-[dist init] rank = 3, world_size = 8
-[dist init] rank = 7, world_size = 8
-[rank6]:E0603 16:26:10.117000 78485 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 6 with exit code: 10
-[rank2]:E0603 16:26:10.118000 78481 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 2 with exit code: 10
-[rank4]:E0603 16:26:10.118000 78483 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Last error: -[rank4]:E0603 16:26:10.118000 78483 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Error while creating shared memory segment /dev/shm/nccl-auyogM (size 8257920), error: No space left on device (28) -[rank4]:E0603 16:26:10.118000 78483 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] -[rank4]:E0603 16:26:10.118000 78483 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] To execute this test, run the following from the base repo dir: -[rank4]:E0603 16:26:10.118000 78483 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_learning_pipelining_without_interleaving -[rank4]:E0603 16:26:10.118000 78483 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] -[rank4]:E0603 16:26:10.118000 78483 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 -[rank4]:E0603 16:26:10.118000 78483 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 4 with exit code: 10 -[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Caught exception: -[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Traceback (most recent call last): -[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test -[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] getattr(self, test_name)() -[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper -[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] fn() -[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper -[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] method(*args, **kwargs) -[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 239, in test_learning_pipelining_without_interleaving -[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] self._forward_backward_test_impl( -[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl -[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] loss = fwd_bwd_func( -[rank0]:E0603 16:26:10.120000 78479 
/skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 349, in forward_backward_pipelining_without_interleaving -[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] send_forward( -[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 145, in send_forward -[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_communication.send_forward( -[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 401, in send_forward -[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] _communicate( -[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate -[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm) -[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops -[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] reqs = torch.distributed.batch_isend_irecv(ops) -[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv -[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] p2p_op.op( -[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2366, in isend -[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] return group.send([tensor], group_dst, tag) -[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1 -[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error. 
-[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Last error: -[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] Error while creating shared memory segment /dev/shm/nccl-bYNB7m (size 8257920), error: No space left on device (28) -[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] -[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] To execute this test, run the following from the base repo dir: -[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_learning_pipelining_without_interleaving -[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] -[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 -[rank0]:E0603 16:26:10.120000 78479 /skishore/github/pytorch/torch/testing/_internal/common_distributed.py:741] exiting process 0 with exit code: 10 -[rank5]:[W603 16:26:10.298413670 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) -[rank7]:[W603 16:26:10.307117426 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) -[rank3]:[W603 16:26:10.349196783 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) -[rank1]:[W603 16:26:10.354651742 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) -[rank6]:[W603 16:26:10.420917412 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) -[rank2]:[W603 16:26:10.432103319 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) -[rank4]:[W603 16:26:10.444288648 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) -[rank0]:[W603 16:26:10.468277920 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. 
For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) -ERROR -test_learning_pipelining_without_interleaving_ucc_for_p2p (test_pipeline_parallel_fwd_bwd.NcclPipelineParallelForwardBackwardTest) ... Process 1 terminated with exit code 10, terminating remaining processes. -[rank3]:[W603 16:26:32.370430829 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank1]:[W603 16:26:32.374126929 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank0]:[W603 16:26:32.374865154 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank7]:[W603 16:26:33.111700547 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 7] using GPU 7 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank4]:[W603 16:26:33.222152363 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 4] using GPU 4 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank2]:[W603 16:26:33.230871453 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank5]:[W603 16:26:33.234719208 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 5] using GPU 5 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank6]:[W603 16:26:33.237089914 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 6] using GPU 6 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[dist init] rank = 7, world_size = 8 -[dist init] rank = 5, world_size = 8 -[dist init] rank = 1, world_size = 8 -[dist init] rank = 2, world_size = 8 -[dist init] rank = 6, world_size = 8 -[dist init] rank = 3, world_size = 8 -[dist init] rank = 4, world_size = 8 -[dist init] rank = 0, world_size = 8 -[rank0]:[W603 16:26:45.905335399 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. 
For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) -skipped 'Test skipped at subprocess level, look at subprocess log for skip reason' -test_pipelining_without_interleaving_encoder_and_decoder (test_pipeline_parallel_fwd_bwd.NcclPipelineParallelWithToyParallelMLP) ... [rank1]:[W603 16:26:59.114654676 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank0]:[W603 16:26:59.138041697 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank2]:[W603 16:26:59.151866578 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank5]:[W603 16:26:59.153898992 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 5] using GPU 5 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank6]:[W603 16:26:59.155885838 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 6] using GPU 6 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank7]:[W603 16:26:59.156481882 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 7] using GPU 7 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank4]:[W603 16:26:59.159510874 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 4] using GPU 4 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank3]:[W603 16:26:59.164376950 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. 
-/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/utils.py:81: UserWarning: This function is only for unittest - warnings.warn("This function is only for unittest") -[dist init] rank = 5, world_size = 8 - > number of parameters on (tensor, pipeline) model parallel rank (1, 2): 16576 -[dist init] rank = 3, world_size = 8 - > number of parameters on (tensor, pipeline) model parallel rank (1, 1): 16576 -[dist init] rank = 7, world_size = 8 - > number of parameters on (tensor, pipeline) model parallel rank (1, 3): 16576 -[dist init] rank = 4, world_size = 8 - > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 16576 -[dist init] rank = 6, world_size = 8 - > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 16576 -[dist init] rank = 0, world_size = 8 - > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 16576 -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -[dist init] rank = 2, world_size = 8 - > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 16576 -[dist init] rank = 1, world_size = 8 - > number of parameters on (tensor, pipeline) model parallel rank (1, 0): 16576 -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -ok -test_pipelining_without_interleaving_encoder_or_decoder (test_pipeline_parallel_fwd_bwd.NcclPipelineParallelWithToyParallelMLP) ... 
[rank6]:[W603 16:28:16.813641713 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 6] using GPU 6 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank7]:[W603 16:28:16.822877040 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 7] using GPU 7 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank3]:[W603 16:28:16.822896689 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank0]:[W603 16:28:16.826799597 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank2]:[W603 16:28:16.830679000 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank1]:[W603 16:28:16.831445467 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank5]:[W603 16:28:16.831964727 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 5] using GPU 5 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank4]:[W603 16:28:16.832049082 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 4] using GPU 4 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/utils.py:81: UserWarning: This function is only for unittest - warnings.warn("This function is only for unittest") -[dist init] rank = 7, world_size = 8 - > number of parameters on (tensor, pipeline) model parallel rank (1, 3): 16576 -[dist init] rank = 3, world_size = 8 - > number of parameters on (tensor, pipeline) model parallel rank (1, 1): 16576 -[dist init] rank = 0, world_size = 8 - > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 16576 -[dist init] rank = 4, world_size = 8 - > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 16576 -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. 
- with torch.cuda.amp.autocast( -[dist init] rank = 1, world_size = 8 - > number of parameters on (tensor, pipeline) model parallel rank (1, 0): 16576 -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -[dist init] rank = 6, world_size = 8 - > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 16576 -[dist init] rank = 2, world_size = 8 - > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 16576 -[dist init] rank = 5, world_size = 8 - > number of parameters on (tensor, pipeline) model parallel rank (1, 2): 16576 -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -ok -test_pipelining_without_interleaving_inferenc_encoder_and_decoder (test_pipeline_parallel_fwd_bwd.NcclPipelineParallelWithToyParallelMLP) ... [rank5]:[W603 16:29:58.144385859 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 5] using GPU 5 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank0]:[W603 16:29:58.153746813 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank4]:[W603 16:29:58.154014020 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 4] using GPU 4 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. 
-[rank1]:[W603 16:29:58.166112505 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank7]:[W603 16:29:58.167303591 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 7] using GPU 7 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank3]:[W603 16:29:58.244365810 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank2]:[W603 16:29:58.249077668 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank6]:[W603 16:29:58.250782958 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 6] using GPU 6 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/utils.py:81: UserWarning: This function is only for unittest - warnings.warn("This function is only for unittest") -[dist init] rank = 6, world_size = 8 - > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 16576 -[dist init] rank = 7, world_size = 8 - > number of parameters on (tensor, pipeline) model parallel rank (1, 3): 16576 -[dist init] rank = 5, world_size = 8 - > number of parameters on (tensor, pipeline) model parallel rank (1, 2): 16576 -[dist init] rank = 1, world_size = 8 - > number of parameters on (tensor, pipeline) model parallel rank (1, 0): 16576 -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -[dist init] rank = 2, world_size = 8 - > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 16576 -[dist init] rank = 3, world_size = 8 - > number of parameters on (tensor, pipeline) model parallel rank (1, 1): 16576 -[dist init] rank = 0, world_size = 8 - > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 16576 -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -[dist init] rank = 4, world_size = 8 - > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 16576 -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. 
- with torch.cuda.amp.autocast( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -ok -test_pipelining_without_interleaving_inference_sequence_paralle_encoder_and_decoder (test_pipeline_parallel_fwd_bwd.NcclPipelineParallelWithToyParallelMLP) ... [rank5]:[W603 16:30:36.818185801 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 5] using GPU 5 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank2]:[W603 16:30:36.820888440 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank4]:[W603 16:30:36.822907925 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 4] using GPU 4 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank1]:[W603 16:30:36.834983897 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank6]:[W603 16:30:36.911624183 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 6] using GPU 6 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank7]:[W603 16:30:36.919864365 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 7] using GPU 7 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. 
-[rank3]:[W603 16:30:37.098846840 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank0]:[W603 16:30:37.101206801 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/utils.py:81: UserWarning: This function is only for unittest - warnings.warn("This function is only for unittest") -[dist init] rank = 5, world_size = 8 - > number of parameters on (tensor, pipeline) model parallel rank (1, 2): 16576 -[dist init] rank = 6, world_size = 8 - > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 16576 -[dist init] rank = 4, world_size = 8 - > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 16576 -[dist init] rank = 0, world_size = 8 - > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 16576 -[dist init] rank = 7, world_size = 8 - > number of parameters on (tensor, pipeline) model parallel rank (1, 3): 16576 -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -[dist init] rank = 3, world_size = 8 - > number of parameters on (tensor, pipeline) model parallel rank (1, 1): 16576 -[dist init] rank = 1, world_size = 8 - > number of parameters on (tensor, pipeline) model parallel rank (1, 0): 16576 -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -[dist init] rank = 2, world_size = 8 - > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 16576 -/skishore/github/pytorch/torch/distributed/c10d_logger.py:81: FutureWarning: `torch.distributed._all_gather_base` is a private function and will be deprecated. Please use `torch.distributed.all_gather_into_tensor` instead. - return func(*args, **kwargs) -/skishore/github/pytorch/torch/distributed/c10d_logger.py:81: FutureWarning: `torch.distributed._all_gather_base` is a private function and will be deprecated. Please use `torch.distributed.all_gather_into_tensor` instead. - return func(*args, **kwargs) -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/mappings.py:125: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead. - torch.distributed._reduce_scatter_base( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/mappings.py:125: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead. 
- torch.distributed._reduce_scatter_base( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -/skishore/github/pytorch/torch/distributed/c10d_logger.py:81: FutureWarning: `torch.distributed._all_gather_base` is a private function and will be deprecated. Please use `torch.distributed.all_gather_into_tensor` instead. - return func(*args, **kwargs) -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -/skishore/github/pytorch/torch/distributed/c10d_logger.py:81: FutureWarning: `torch.distributed._all_gather_base` is a private function and will be deprecated. Please use `torch.distributed.all_gather_into_tensor` instead. - return func(*args, **kwargs) -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/mappings.py:125: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead. - torch.distributed._reduce_scatter_base( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/mappings.py:125: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead. - torch.distributed._reduce_scatter_base( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -/skishore/github/pytorch/torch/distributed/c10d_logger.py:81: FutureWarning: `torch.distributed._all_gather_base` is a private function and will be deprecated. Please use `torch.distributed.all_gather_into_tensor` instead. - return func(*args, **kwargs) -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -/skishore/github/pytorch/torch/distributed/c10d_logger.py:81: FutureWarning: `torch.distributed._all_gather_base` is a private function and will be deprecated. Please use `torch.distributed.all_gather_into_tensor` instead. - return func(*args, **kwargs) -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/mappings.py:125: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead. - torch.distributed._reduce_scatter_base( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/mappings.py:125: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead. 
- torch.distributed._reduce_scatter_base( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -/skishore/github/pytorch/torch/distributed/c10d_logger.py:81: FutureWarning: `torch.distributed._all_gather_base` is a private function and will be deprecated. Please use `torch.distributed.all_gather_into_tensor` instead. - return func(*args, **kwargs) -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -/skishore/github/pytorch/torch/distributed/c10d_logger.py:81: FutureWarning: `torch.distributed._all_gather_base` is a private function and will be deprecated. Please use `torch.distributed.all_gather_into_tensor` instead. - return func(*args, **kwargs) -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/mappings.py:125: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead. - torch.distributed._reduce_scatter_base( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/mappings.py:125: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead. - torch.distributed._reduce_scatter_base( -ok -test_pipelining_without_interleaving_sequence_paralle_encoder_and_decoder (test_pipeline_parallel_fwd_bwd.NcclPipelineParallelWithToyParallelMLP) ... [rank7]:[W603 16:31:14.282970134 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 7] using GPU 7 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank2]:[W603 16:31:14.397246689 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank4]:[W603 16:31:14.401349996 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 4] using GPU 4 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank1]:[W603 16:31:14.402028883 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank5]:[W603 16:31:14.403024530 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 5] using GPU 5 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. 
-[rank3]:[W603 16:31:14.405795309 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank0]:[W603 16:31:14.472123643 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank6]:[W603 16:31:14.485535778 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 6] using GPU 6 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/utils.py:81: UserWarning: This function is only for unittest - warnings.warn("This function is only for unittest") -[dist init] rank = 6, world_size = 8 - > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 16576 -[dist init] rank = 4, world_size = 8 - > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 16576 -[dist init] rank = 7, world_size = 8 - > number of parameters on (tensor, pipeline) model parallel rank (1, 3): 16576 -[dist init] rank = 0, world_size = 8 - > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 16576 -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -[dist init] rank = 2, world_size = 8 - > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 16576 -[dist init] rank = 1, world_size = 8 - > number of parameters on (tensor, pipeline) model parallel rank (1, 0): 16576 -[dist init] rank = 3, world_size = 8 - > number of parameters on (tensor, pipeline) model parallel rank (1, 1): 16576 -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -[dist init] rank = 5, world_size = 8 - > number of parameters on (tensor, pipeline) model parallel rank (1, 2): 16576 -/skishore/github/pytorch/torch/distributed/c10d_logger.py:81: FutureWarning: `torch.distributed._all_gather_base` is a private function and will be deprecated. Please use `torch.distributed.all_gather_into_tensor` instead. - return func(*args, **kwargs) -/skishore/github/pytorch/torch/distributed/c10d_logger.py:81: FutureWarning: `torch.distributed._all_gather_base` is a private function and will be deprecated. Please use `torch.distributed.all_gather_into_tensor` instead. - return func(*args, **kwargs) -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/mappings.py:125: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead. 
- torch.distributed._reduce_scatter_base( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/mappings.py:125: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead. - torch.distributed._reduce_scatter_base( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -/skishore/github/pytorch/torch/distributed/c10d_logger.py:81: FutureWarning: `torch.distributed._all_gather_base` is a private function and will be deprecated. Please use `torch.distributed.all_gather_into_tensor` instead. - return func(*args, **kwargs) -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -/skishore/github/pytorch/torch/distributed/c10d_logger.py:81: FutureWarning: `torch.distributed._all_gather_base` is a private function and will be deprecated. Please use `torch.distributed.all_gather_into_tensor` instead. - return func(*args, **kwargs) -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/mappings.py:125: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead. - torch.distributed._reduce_scatter_base( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/mappings.py:125: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead. - torch.distributed._reduce_scatter_base( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -/skishore/github/pytorch/torch/distributed/c10d_logger.py:81: FutureWarning: `torch.distributed._all_gather_base` is a private function and will be deprecated. Please use `torch.distributed.all_gather_into_tensor` instead. - return func(*args, **kwargs) -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -/skishore/github/pytorch/torch/distributed/c10d_logger.py:81: FutureWarning: `torch.distributed._all_gather_base` is a private function and will be deprecated. Please use `torch.distributed.all_gather_into_tensor` instead. - return func(*args, **kwargs) -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/mappings.py:125: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead. 
- torch.distributed._reduce_scatter_base( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/mappings.py:125: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead. - torch.distributed._reduce_scatter_base( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -/skishore/github/pytorch/torch/distributed/c10d_logger.py:81: FutureWarning: `torch.distributed._all_gather_base` is a private function and will be deprecated. Please use `torch.distributed.all_gather_into_tensor` instead. - return func(*args, **kwargs) -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -/skishore/github/pytorch/torch/distributed/c10d_logger.py:81: FutureWarning: `torch.distributed._all_gather_base` is a private function and will be deprecated. Please use `torch.distributed.all_gather_into_tensor` instead. - return func(*args, **kwargs) -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/mappings.py:125: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead. - torch.distributed._reduce_scatter_base( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/mappings.py:125: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead. - torch.distributed._reduce_scatter_base( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/layers.py:358: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead. - handle = torch.distributed._reduce_scatter_base( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/layers.py:358: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead. - handle = torch.distributed._reduce_scatter_base( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/layers.py:358: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead. - handle = torch.distributed._reduce_scatter_base( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/layers.py:358: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead. - handle = torch.distributed._reduce_scatter_base( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/layers.py:358: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead. 
- handle = torch.distributed._reduce_scatter_base( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/layers.py:358: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead. - handle = torch.distributed._reduce_scatter_base( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/layers.py:358: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead. - handle = torch.distributed._reduce_scatter_base( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/layers.py:358: FutureWarning: `torch.distributed._reduce_scatter_base` is a private function and will be deprecated. Please use `torch.distributed.reduce_scatter_tensor` instead. - handle = torch.distributed._reduce_scatter_base( -ok -test_pipelining_without_interleaving_sequence_parallel_encoder_or_decoder (test_pipeline_parallel_fwd_bwd.NcclPipelineParallelWithToyParallelMLP) ... [rank0]:[W603 16:32:27.723335566 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank3]:[W603 16:32:27.726510125 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank1]:[W603 16:32:27.735554250 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank2]:[W603 16:32:27.735853684 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank4]:[W603 16:32:27.739658569 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 4] using GPU 4 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank5]:[W603 16:32:27.747217020 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 5] using GPU 5 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank7]:[W603 16:32:27.755008258 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 7] using GPU 7 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank6]:[W603 16:32:27.755425907 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 6] using GPU 6 as device used by this process is currently unknown. 
-/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/utils.py:81: UserWarning: This function is only for unittest
-  warnings.warn("This function is only for unittest")
-[dist init] rank = 1, world_size = 8
- > number of parameters on (tensor, pipeline) model parallel rank (1, 0): 16576
-[dist init] rank = 7, world_size = 8
- > number of parameters on (tensor, pipeline) model parallel rank (1, 3): 16576
-[dist init] rank = 5, world_size = 8
- > number of parameters on (tensor, pipeline) model parallel rank (1, 2): 16576
-[dist init] rank = 2, world_size = 8
- > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 16576
-[dist init] rank = 4, world_size = 8
- > number of parameters on (tensor, pipeline) model parallel rank (0, 2): 16576
-[dist init] rank = 6, world_size = 8
- > number of parameters on (tensor, pipeline) model parallel rank (0, 3): 16576
-[dist init] rank = 0, world_size = 8
- > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 16576
-[dist init] rank = 3, world_size = 8
- > number of parameters on (tensor, pipeline) model parallel rank (1, 1): 16576
-  (the torch.cuda.amp.autocast, torch.distributed._all_gather_base and torch.distributed._reduce_scatter_base FutureWarnings shown above repeat here once per rank; duplicates omitted)
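The autocast FutureWarning repeated throughout this run points at the device-agnostic context manager. A small sketch of the rename, illustrative only and not taken from the patch (fp16 autocast is an assumption):

    import torch

    x = torch.randn(8, 8, device="cuda")
    w = torch.randn(8, 8, device="cuda")

    # Deprecated spelling flagged in the log:
    #   with torch.cuda.amp.autocast(enabled=True, dtype=torch.float16):
    # Recommended replacement:
    with torch.amp.autocast("cuda", enabled=True, dtype=torch.float16):
        y = x @ w  # matmul runs in float16 under autocast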
-ok
-test_pipelining_without_interleaving_sequence_parallel_encoder_or_decoder_half (test_pipeline_parallel_fwd_bwd.NcclPipelineParallelWithToyParallelMLP) ... [rank2]:[W603 16:33:41.746748332 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device.
-  (the same ProcessGroupNCCL warning is emitted by the other seven ranks; duplicates omitted)
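The "rank to GPU mapping is currently unknown" warning suggests binding the process group to a device at init time. A hypothetical sketch of what that looks like, assuming a torchrun-style launcher that sets LOCAL_RANK and a PyTorch release recent enough to accept device_id:

    import os
    import torch
    import torch.distributed as dist

    local_rank = int(os.environ["LOCAL_RANK"])
    torch.cuda.set_device(local_rank)

    # Passing device_id ties this rank to a specific GPU, which is what
    # the ProcessGroupNCCL warning asks for.
    dist.init_process_group(
        backend="nccl",
        device_id=torch.device("cuda", local_rank),
    )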
-/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/utils.py:81: UserWarning: This function is only for unittest
-  warnings.warn("This function is only for unittest")
-[dist init] rank = 0..7, world_size = 8
- > number of parameters on every (tensor, pipeline) model parallel rank: 16576
-/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/_autocast_utils.py:26: FutureWarning: `torch.cuda.amp.autocast_mode._cast(value, dtype)` is deprecated. Please use `torch.amp.autocast_mode._cast(value, 'cuda', dtype)` instead.
-  return torch.cuda.amp.autocast_mode._cast(args, torch.get_autocast_gpu_dtype())
-  (the autocast, _all_gather_base and _reduce_scatter_base FutureWarnings repeat here once per rank; duplicates omitted)
-ok
-test_inference_async_pipelining_with_interleaving (test_pipeline_parallel_fwd_bwd.UccPipelineParallelForwardBackwardTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)'
-test_inference_async_pipelining_without_interleaving (test_pipeline_parallel_fwd_bwd.UccPipelineParallelForwardBackwardTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)'
-test_inference_no_pipelining (test_pipeline_parallel_fwd_bwd.UccPipelineParallelForwardBackwardTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)'
-test_inference_pipelining_with_interleaving (test_pipeline_parallel_fwd_bwd.UccPipelineParallelForwardBackwardTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)'
-test_inference_pipelining_without_interleaving (test_pipeline_parallel_fwd_bwd.UccPipelineParallelForwardBackwardTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)'
-test_learning_async_pipelining_with_interleaving (test_pipeline_parallel_fwd_bwd.UccPipelineParallelForwardBackwardTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)'
-test_learning_async_pipelining_without_interleaving (test_pipeline_parallel_fwd_bwd.UccPipelineParallelForwardBackwardTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)'
-test_learning_no_pipelining (test_pipeline_parallel_fwd_bwd.UccPipelineParallelForwardBackwardTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)'
-test_learning_pipelining_with_interleaving (test_pipeline_parallel_fwd_bwd.UccPipelineParallelForwardBackwardTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)'
-test_learning_pipelining_without_interleaving (test_pipeline_parallel_fwd_bwd.UccPipelineParallelForwardBackwardTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)'
-test_cuda_rng_tracker (test_random.NcclTransformerRandomTest) ... [rank0]:[W603 16:34:39.248877851 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device.
-  (the same ProcessGroupNCCL warning is emitted by ranks 1-3; duplicates omitted)
-/skishore/github/apex/tests/L0/run_transformer/test_random.py:73: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at /skishore/github/pytorch/torch/csrc/tensor/python_tensor.cpp:78.)
-  tensor = torch.cuda.FloatTensor(size)
-  (the same UserWarning is emitted on all four ranks; duplicates omitted)
-[dist init] rank = 0, world_size = 4
-[dist init] rank = 1, world_size = 4
-[dist init] rank = 3, world_size = 4
-[dist init] rank = 2, world_size = 4
-ok
-test_set_cuda_rng_state (test_random.NcclTransformerRandomTest) ... [rank1]:[W603 16:34:54.482892089 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device.
-  (the same ProcessGroupNCCL warning is emitted by ranks 0, 2 and 3; duplicates omitted)
-/skishore/github/apex/tests/L0/run_transformer/test_random.py:30: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at /skishore/github/pytorch/torch/csrc/tensor/python_tensor.cpp:78.)
-  tensor = torch.cuda.FloatTensor(size)
-  (the same UserWarning is emitted on all four ranks; duplicates omitted)
-[dist init] rank = 0, world_size = 4
-[dist init] rank = 3, world_size = 4
-[dist init] rank = 1, world_size = 4
-[dist init] rank = 2, world_size = 4
-ok
-test_cuda_rng_tracker (test_random.UccTransformerRandomTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)'
-test_set_cuda_rng_state (test_random.UccTransformerRandomTest) ... skipped 'Requires [`torch_ucc`](https://github.com/facebookresearch/torch_ucc)'
-test_transformer (test_transformer_module.TestTransformer) ...
-*****************************************
-Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
-*****************************************
-/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
-  with torch.cuda.amp.autocast(
-  (the same FutureWarning is emitted on all eight ranks; duplicates omitted)
-/skishore/github/pytorch/torch/autograd/graph.py:824: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at /skishore/github/pytorch/torch/csrc/autograd/autograd_not_implemented_fallback.cpp:62.)
-  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
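The torch.cuda.*DtypeTensor warnings in the test_random runs above point to the factory functions. A minimal sketch of the modern spelling, illustrative only and not from this patch (the size value is made up):

    import torch

    size = (1024,)

    # Old: tensor = torch.cuda.FloatTensor(size)   (uninitialized CUDA storage)
    tensor = torch.empty(size, dtype=torch.float32, device="cuda")

    # The same pattern applies to the other torch.cuda.*Tensor constructors,
    # e.g. building a tensor from a Python list of values:
    # Old: torch.cuda.LongTensor([8, 8, 8])
    values = torch.tensor([8, 8, 8], dtype=torch.long, device="cuda")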
-  (the same c10d::allreduce_ UserWarning is emitted by the other ranks; duplicates omitted)
-[rank7]:[W603 16:37:20.454816579 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 7] using GPU 7 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device.
-  (the same ProcessGroupNCCL warning is emitted by ranks 0-6; duplicates omitted)
-[rank0]:[W603 16:37:35.197418010 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
-  (the same warning is emitted by ranks 1-7; duplicates omitted)
-
-*****************************************
-Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
-*****************************************
-/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/tensor_parallel/data.py:50: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at /skishore/github/pytorch/torch/csrc/tensor/python_tensor.cpp:78.)
-  sizes_cuda = torch.cuda.LongTensor(sizes)
-  (the same UserWarning is emitted on all eight ranks; duplicates omitted)
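The ProcessGroupNCCL.cpp:1553 warnings above are about teardown: the process group is never torn down before exit. A sketch of the shutdown the message asks for, illustrative only (the run_test wrapper is hypothetical):

    import torch.distributed as dist

    def run_test():
        ...  # collective work happens here

    if __name__ == "__main__":
        try:
            run_test()
        finally:
            # Explicitly tear the process group down so NCCL resources are
            # released before the program exits.
            if dist.is_initialized():
                dist.destroy_process_group()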
-/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
-  with torch.cuda.amp.autocast(
-/skishore/github/pytorch/torch/autograd/graph.py:824: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. (same full warning text as above)
-  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
-  (both warnings repeat once per rank; duplicates omitted)
-[rank7]:[W603 16:43:01.663759187 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 7] using GPU 7 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device.
-  (the same ProcessGroupNCCL warning is emitted by ranks 0-6; duplicates omitted)
-[rank2]:[W603 16:43:09.017878787 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
-  (the same warning is emitted by the remaining ranks; duplicates omitted)
For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) -[rank3]:[W603 16:43:10.064505398 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) -[rank5]:[W603 16:43:10.066178591 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) -[rank0]:[W603 16:43:10.071842795 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) - -***************************************** -Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -***************************************** -[rank3]:[W603 16:43:35.670623763 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank7]:[W603 16:43:35.670765383 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 7] using GPU 7 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank1]:[W603 16:43:35.672929394 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank5]:[W603 16:43:35.676482316 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 5] using GPU 5 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/utils.py:81: UserWarning: This function is only for unittest - warnings.warn("This function is only for unittest") -[rank0]:[W603 16:43:35.691670551 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank6]:[W603 16:43:35.699256296 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 6] using GPU 6 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank2]:[W603 16:43:36.063533624 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. 
You can specify device_id in init_process_group() to force use of a particular device. -[rank4]:[W603 16:43:36.073806186 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 4] using GPU 4 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/common.py:295: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast( -/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/utils.py:81: UserWarning: This function is only for unittest - warnings.warn("This function is only for unittest") -[rank0]:[W603 16:46:11.098930997 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) -[rank4]:[W603 16:46:11.115717975 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) -[rank2]:[W603 16:46:11.119934281 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. 
For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) -[rank6]:[W603 16:46:11.123500132 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) -[rank1]:[W603 16:46:11.125988935 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) -[rank5]:[W603 16:46:11.131786406 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) -[rank3]:[W603 16:46:11.133387652 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) -[rank7]:[W603 16:46:11.135553105 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) -ok -test_split_tensor_along_last_dim (test_transformer_utils.TransformerUtilsTest) ... ####################################################### -# Python executable path: /opt/conda/envs/py_3.10/bin/python -# 3 tests: ['/skishore/github/apex/tests/L0/run_transformer/run_gpt_minimal_test.py', '/skishore/github/apex/tests/L0/run_transformer/run_bert_minimal_test.py', '/skishore/github/apex/tests/L0/run_transformer/run_dynamic_batchsize_test.py'] -####################################################### -### 1 / 3: cmd: /opt/conda/envs/py_3.10/bin/python -m torch.distributed.run --nproc_per_node=8 /skishore/github/apex/tests/L0/run_transformer/run_gpt_minimal_test.py --micro-batch-size 2 --num-layers 16 --hidden-size 256 --num-attention-heads 8 --max-position-embeddings 512 --seq-length 512 --global-batch-size 128 --pipeline-model-parallel-size 4 --tensor-model-parallel-size 2 -### 2 / 3: cmd: /opt/conda/envs/py_3.10/bin/python -m torch.distributed.run --nproc_per_node=8 /skishore/github/apex/tests/L0/run_transformer/run_bert_minimal_test.py --micro-batch-size 2 --num-layers 16 --hidden-size 256 --num-attention-heads 8 --max-position-embeddings 512 --seq-length 512 --global-batch-size 128 --pipeline-model-parallel-size 4 --tensor-model-parallel-size 2 --bert-no-binary-head -### 3 / 3: cmd: /opt/conda/envs/py_3.10/bin/python -m torch.distributed.run --nproc_per_node=8 /skishore/github/apex/tests/L0/run_transformer/run_dynamic_batchsize_test.py --micro-batch-size 2 --num-layers 16 --hidden-size 256 --num-attention-heads 8 --max-position-embeddings 512 --seq-length 512 --global-batch-size 128 --use-cpu-initialization -### PASSED -[rank3]:[W603 16:46:31.117388689 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. 
-[rank1]:[W603 16:46:31.130568896 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank0]:[W603 16:46:31.163744050 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[rank2]:[W603 16:46:31.206279493 ProcessGroupNCCL.cpp:4843] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can specify device_id in init_process_group() to force use of a particular device. -[dist init] rank = 3, world_size = 4 -[dist init] rank = 0, world_size = 4 -[dist init] rank = 1, world_size = 4 -[dist init] rank = 2, world_size = 4 -ok - -====================================================================== -ERROR: test_inference_async_pipelining_with_interleaving (test_pipeline_parallel_fwd_bwd.NcclPipelineParallelForwardBackwardTest) ----------------------------------------------------------------------- -Traceback (most recent call last): - File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 605, in wrapper - self._join_processes(fn) - File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 845, in _join_processes - self._check_return_codes(elapsed_time) - File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 894, in _check_return_codes - raise RuntimeError(error) -RuntimeError: Process 3 exited with error code 10 and exception: -Traceback (most recent call last): - File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test - getattr(self, test_name)() - File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper - fn() - File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper - method(*args, **kwargs) - File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 278, in test_inference_async_pipelining_with_interleaving - self._forward_backward_test_impl( - File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl - loss = fwd_bwd_func( - File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_with_interleaving.py", line 219, in _forward_backward_pipelining_with_interleaving - p2p_communication.recv_forward( - File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 342, in recv_forward - input_tensor, _ = _communicate( - File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate - tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm) - File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in 
_run_p2pops - reqs = torch.distributed.batch_isend_irecv(ops) - File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv - p2p_op.op( - File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2411, in irecv - return group.recv([tensor], group_src, tag) -torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1 -ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error. -Last error: -Error while creating shared memory segment /dev/shm/nccl-BZatXC (size 8257920), error: No space left on device (28) - -To execute this test, run the following from the base repo dir: - PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_inference_async_pipelining_with_interleaving - -This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 - - - -====================================================================== -ERROR: test_inference_async_pipelining_without_interleaving (test_pipeline_parallel_fwd_bwd.NcclPipelineParallelForwardBackwardTest) ----------------------------------------------------------------------- -Traceback (most recent call last): - File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 605, in wrapper - self._join_processes(fn) - File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 845, in _join_processes - self._check_return_codes(elapsed_time) - File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 894, in _check_return_codes - raise RuntimeError(error) -RuntimeError: Process 1 exited with error code 10 and exception: -Traceback (most recent call last): - File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test - getattr(self, test_name)() - File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper - fn() - File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper - method(*args, **kwargs) - File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 254, in test_inference_async_pipelining_without_interleaving - self._forward_backward_test_impl( - File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl - loss = fwd_bwd_func( - File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 349, in forward_backward_pipelining_without_interleaving - send_forward( - File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 145, in send_forward - p2p_communication.send_forward( - File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 401, in send_forward - _communicate( - File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate - tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, 
async_comm=async_comm) - File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops - reqs = torch.distributed.batch_isend_irecv(ops) - File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv - p2p_op.op( - File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2366, in isend - return group.send([tensor], group_dst, tag) -torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1 -ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error. -Last error: -Error while creating shared memory segment /dev/shm/nccl-G7NK1l (size 8257920), error: No space left on device (28) - -To execute this test, run the following from the base repo dir: - PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_inference_async_pipelining_without_interleaving - -This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 - - - -====================================================================== -ERROR: test_inference_pipelining_with_interleaving (test_pipeline_parallel_fwd_bwd.NcclPipelineParallelForwardBackwardTest) ----------------------------------------------------------------------- -Traceback (most recent call last): - File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 605, in wrapper - self._join_processes(fn) - File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 845, in _join_processes - self._check_return_codes(elapsed_time) - File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 894, in _check_return_codes - raise RuntimeError(error) -RuntimeError: Process 7 exited with error code 10 and exception: -Traceback (most recent call last): - File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test - getattr(self, test_name)() - File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper - fn() - File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper - method(*args, **kwargs) - File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 266, in test_inference_pipelining_with_interleaving - self._forward_backward_test_impl( - File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl - loss = fwd_bwd_func( - File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_with_interleaving.py", line 219, in _forward_backward_pipelining_with_interleaving - p2p_communication.recv_forward( - File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 342, in recv_forward - input_tensor, _ = _communicate( - File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate - tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm) - File 
"/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops - reqs = torch.distributed.batch_isend_irecv(ops) - File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv - p2p_op.op( - File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2411, in irecv - return group.recv([tensor], group_src, tag) -torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1 -ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error. -Last error: -Error while creating shared memory segment /dev/shm/nccl-lidQW9 (size 8257920), error: No space left on device (28) - -To execute this test, run the following from the base repo dir: - PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_inference_pipelining_with_interleaving - -This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 - - - -====================================================================== -ERROR: test_inference_pipelining_without_interleaving (test_pipeline_parallel_fwd_bwd.NcclPipelineParallelForwardBackwardTest) ----------------------------------------------------------------------- -Traceback (most recent call last): - File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 605, in wrapper - self._join_processes(fn) - File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 845, in _join_processes - self._check_return_codes(elapsed_time) - File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 894, in _check_return_codes - raise RuntimeError(error) -RuntimeError: Process 1 exited with error code 10 and exception: -Traceback (most recent call last): - File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test - getattr(self, test_name)() - File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper - fn() - File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper - method(*args, **kwargs) - File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 244, in test_inference_pipelining_without_interleaving - self._forward_backward_test_impl( - File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl - loss = fwd_bwd_func( - File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 349, in forward_backward_pipelining_without_interleaving - send_forward( - File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 145, in send_forward - p2p_communication.send_forward( - File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 401, in send_forward - _communicate( - File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate - tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, 
tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm) - File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops - reqs = torch.distributed.batch_isend_irecv(ops) - File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv - p2p_op.op( - File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2366, in isend - return group.send([tensor], group_dst, tag) -torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1 -ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error. -Last error: -Error while creating shared memory segment /dev/shm/nccl-G3jTgB (size 8257920), error: No space left on device (28) - -To execute this test, run the following from the base repo dir: - PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_inference_pipelining_without_interleaving - -This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 - - - -====================================================================== -ERROR: test_learning_async_pipelining_with_interleaving (test_pipeline_parallel_fwd_bwd.NcclPipelineParallelForwardBackwardTest) ----------------------------------------------------------------------- -Traceback (most recent call last): - File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 605, in wrapper - self._join_processes(fn) - File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 845, in _join_processes - self._check_return_codes(elapsed_time) - File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 894, in _check_return_codes - raise RuntimeError(error) -RuntimeError: Process 6 exited with error code 10 and exception: -Traceback (most recent call last): - File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test - getattr(self, test_name)() - File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper - fn() - File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper - method(*args, **kwargs) - File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 272, in test_learning_async_pipelining_with_interleaving - self._forward_backward_test_impl( - File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl - loss = fwd_bwd_func( - File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_with_interleaving.py", line 219, in _forward_backward_pipelining_with_interleaving - p2p_communication.recv_forward( - File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 342, in recv_forward - input_tensor, _ = _communicate( - File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate - tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, 
tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm) - File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops - reqs = torch.distributed.batch_isend_irecv(ops) - File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv - p2p_op.op( - File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2411, in irecv - return group.recv([tensor], group_src, tag) -torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1 -ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error. -Last error: -Error while creating shared memory segment /dev/shm/nccl-kcfrRq (size 8257920), error: No space left on device (28) - -To execute this test, run the following from the base repo dir: - PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_learning_async_pipelining_with_interleaving - -This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 - - - -====================================================================== -ERROR: test_learning_async_pipelining_without_interleaving (test_pipeline_parallel_fwd_bwd.NcclPipelineParallelForwardBackwardTest) ----------------------------------------------------------------------- -Traceback (most recent call last): - File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 605, in wrapper - self._join_processes(fn) - File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 845, in _join_processes - self._check_return_codes(elapsed_time) - File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 894, in _check_return_codes - raise RuntimeError(error) -RuntimeError: Process 1 exited with error code 10 and exception: -Traceback (most recent call last): - File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test - getattr(self, test_name)() - File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper - fn() - File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper - method(*args, **kwargs) - File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 249, in test_learning_async_pipelining_without_interleaving - self._forward_backward_test_impl( - File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl - loss = fwd_bwd_func( - File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 349, in forward_backward_pipelining_without_interleaving - send_forward( - File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 145, in send_forward - p2p_communication.send_forward( - File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 401, in send_forward - _communicate( - File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in 
_communicate - tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm) - File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops - reqs = torch.distributed.batch_isend_irecv(ops) - File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv - p2p_op.op( - File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2366, in isend - return group.send([tensor], group_dst, tag) -torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1 -ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error. -Last error: -Error while creating shared memory segment /dev/shm/nccl-OSTKDz (size 8257920), error: No space left on device (28) - -To execute this test, run the following from the base repo dir: - PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_learning_async_pipelining_without_interleaving - -This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 - - - -====================================================================== -ERROR: test_learning_pipelining_with_interleaving (test_pipeline_parallel_fwd_bwd.NcclPipelineParallelForwardBackwardTest) ----------------------------------------------------------------------- -Traceback (most recent call last): - File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 605, in wrapper - self._join_processes(fn) - File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 845, in _join_processes - self._check_return_codes(elapsed_time) - File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 894, in _check_return_codes - raise RuntimeError(error) -RuntimeError: Process 3 exited with error code 10 and exception: -Traceback (most recent call last): - File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test - getattr(self, test_name)() - File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper - fn() - File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper - method(*args, **kwargs) - File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 260, in test_learning_pipelining_with_interleaving - self._forward_backward_test_impl( - File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl - loss = fwd_bwd_func( - File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_with_interleaving.py", line 219, in _forward_backward_pipelining_with_interleaving - p2p_communication.recv_forward( - File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 342, in recv_forward - input_tensor, _ = _communicate( - File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate - tensor_send_prev_req, tensor_recv_prev_req, 
tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm) - File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops - reqs = torch.distributed.batch_isend_irecv(ops) - File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv - p2p_op.op( - File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2411, in irecv - return group.recv([tensor], group_src, tag) -torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1 -ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error. -Last error: -Error while creating shared memory segment /dev/shm/nccl-UrziMF (size 8257920), error: No space left on device (28) - -To execute this test, run the following from the base repo dir: - PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_learning_pipelining_with_interleaving - -This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 - - - -====================================================================== -ERROR: test_learning_pipelining_without_interleaving (test_pipeline_parallel_fwd_bwd.NcclPipelineParallelForwardBackwardTest) ----------------------------------------------------------------------- -Traceback (most recent call last): - File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 605, in wrapper - self._join_processes(fn) - File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 845, in _join_processes - self._check_return_codes(elapsed_time) - File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 894, in _check_return_codes - raise RuntimeError(error) -RuntimeError: Process 1 exited with error code 10 and exception: -Traceback (most recent call last): - File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 734, in run_test - getattr(self, test_name)() - File "/skishore/github/pytorch/torch/testing/_internal/common_distributed.py", line 607, in wrapper - fn() - File "/skishore/github/pytorch/torch/testing/_internal/common_utils.py", line 3173, in wrapper - method(*args, **kwargs) - File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 239, in test_learning_pipelining_without_interleaving - self._forward_backward_test_impl( - File "/skishore/github/apex/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py", line 180, in _forward_backward_test_impl - loss = fwd_bwd_func( - File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 349, in forward_backward_pipelining_without_interleaving - send_forward( - File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py", line 145, in send_forward - p2p_communication.send_forward( - File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 401, in send_forward - _communicate( - File 
"/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 259, in _communicate - tensor_send_prev_req, tensor_recv_prev_req, tensor_send_next_req, tensor_recv_next_req = _run_p2pops(tensor_send_prev, tensor_send_next, tensor_recv_prev, tensor_recv_next, async_comm=async_comm) - File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/apex/transformer/pipeline_parallel/p2p_communication.py", line 97, in _run_p2pops - reqs = torch.distributed.batch_isend_irecv(ops) - File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2717, in batch_isend_irecv - p2p_op.op( - File "/skishore/github/pytorch/torch/distributed/distributed_c10d.py", line 2366, in isend - return group.send([tensor], group_dst, tag) -torch.distributed.DistBackendError: NCCL error in: /skishore/github/pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp:77, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.25.1 -ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error. -Last error: -Error while creating shared memory segment /dev/shm/nccl-abqPSD (size 8257920), error: No space left on device (28) - -To execute this test, run the following from the base repo dir: - PYTORCH_TEST_WITH_ROCM=1 python test_pipeline_parallel_fwd_bwd.py NcclPipelineParallelForwardBackwardTest.test_learning_pipelining_without_interleaving - -This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 - - - ----------------------------------------------------------------------- -Ran 102 tests in 2937.884s - -FAILED (errors=8, skipped=44) diff --git a/tests/L0/run_amp/test_fused_sgd.py b/tests/L0/run_amp/test_fused_sgd.py index 99d01855a..480cd1132 100644 --- a/tests/L0/run_amp/test_fused_sgd.py +++ b/tests/L0/run_amp/test_fused_sgd.py @@ -14,8 +14,7 @@ ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT try: - from apex.op_builder import AmpCBuilder - amp_C = AmpCBuilder().load() + import amp_C disabled = False from apex.optimizers import FusedSGD as FusedSGD except ImportError as err: diff --git a/tests/L0/run_amp/test_multi_tensor_axpby.py b/tests/L0/run_amp/test_multi_tensor_axpby.py index bbab05aa6..4921378a2 100644 --- a/tests/L0/run_amp/test_multi_tensor_axpby.py +++ b/tests/L0/run_amp/test_multi_tensor_axpby.py @@ -13,8 +13,7 @@ ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT, common_reset try: - from apex.op_builder import AmpCBuilder - amp_C = AmpCBuilder().load() + import amp_C from amp_C import multi_tensor_axpby from apex.multi_tensor_apply import MultiTensorApply disabled = False diff --git a/tests/L0/run_amp/test_multi_tensor_l2norm.py b/tests/L0/run_amp/test_multi_tensor_l2norm.py index d546690de..bb28e52d2 100644 --- a/tests/L0/run_amp/test_multi_tensor_l2norm.py +++ b/tests/L0/run_amp/test_multi_tensor_l2norm.py @@ -12,8 +12,7 @@ ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT, common_reset try: - from apex.op_builder import AmpCBuilder - amp_C = AmpCBuilder().load() + import amp_C from amp_C import multi_tensor_l2norm from apex.multi_tensor_apply import MultiTensorApply disabled = False diff --git a/tests/L0/run_amp/test_multi_tensor_scale.py b/tests/L0/run_amp/test_multi_tensor_scale.py index 85f60fd0e..f97109c9e 100644 --- a/tests/L0/run_amp/test_multi_tensor_scale.py +++ b/tests/L0/run_amp/test_multi_tensor_scale.py @@ -12,8 +12,7 @@ ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT, common_reset try: - from apex.op_builder import AmpCBuilder - amp_C = AmpCBuilder().load() + import amp_C from amp_C import 
multi_tensor_scale from apex.multi_tensor_apply import MultiTensorApply disabled = False diff --git a/tests/L0/run_optimizers/test_lamb.py b/tests/L0/run_optimizers/test_lamb.py index d1c4f70c2..c6ef9aa95 100644 --- a/tests/L0/run_optimizers/test_lamb.py +++ b/tests/L0/run_optimizers/test_lamb.py @@ -4,7 +4,7 @@ import torch from torch.optim import Optimizer import apex -from apex.multi_tensor_apply import MultiTensorApply +from apex.multi_tensor_apply import multi_tensor_applier from itertools import product class RefLAMB(Optimizer): @@ -37,10 +37,8 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-6, weight_decay=0 raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) super(RefLAMB, self).__init__(params, defaults) - multi_tensor_applier = MultiTensorApply(256*32) if multi_tensor_applier.available: - from apex.op_builder import AmpCBuilder - amp_C = AmpCBuilder().load() + import amp_C self.multi_tensor_l2norm=amp_C.multi_tensor_l2norm # Skip buffer self._dummy_overflow_buf = torch.tensor([0], dtype=torch.int, device=self.param_groups[0]["params"][0].device) @@ -74,7 +72,6 @@ def step(self, closure=None): device = self.param_groups[0]["params"][0].device g_norm_32, g_norm_16 = torch.zeros(1, device=device), torch.zeros(1, device=device) # compute grad norm for two lists - multi_tensor_applier = MultiTensorApply(256*32) if len(g_all_32) > 0: g_norm_32 = multi_tensor_applier(self.multi_tensor_l2norm, self._dummy_overflow_buf, diff --git a/tests/L0/run_transformer/test_fused_bias_swiglu.py b/tests/L0/run_transformer/test_fused_bias_swiglu.py index 66cf4c1e8..e7c2e4793 100644 --- a/tests/L0/run_transformer/test_fused_bias_swiglu.py +++ b/tests/L0/run_transformer/test_fused_bias_swiglu.py @@ -1,9 +1,8 @@ import torch -from apex.op_builder import FusedBiasSwiGLUBuilder +import fused_bias_swiglu from torch.testing._internal import common_utils import torch.nn.functional as F -fused_bias_swiglu = FusedBiasSwiGLUBuilder().load() class TestFusedBiasSwiGLU(common_utils.TestCase): diff --git a/tests/L1/common/main_amp.py b/tests/L1/common/main_amp.py index 93623068d..106a0f637 100644 --- a/tests/L1/common/main_amp.py +++ b/tests/L1/common/main_amp.py @@ -21,7 +21,7 @@ from apex.parallel import DistributedDataParallel as DDP from apex.fp16_utils import * from apex import amp, optimizers - from apex.multi_tensor_apply import MultiTensorApply + from apex.multi_tensor_apply import multi_tensor_applier except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to run this example.") @@ -99,7 +99,6 @@ def fast_collate(batch): # Let multi_tensor_applier be the canary in the coalmine # that verifies if the backend is what we think it is -multi_tensor_applier = MultiTensorApply(256*32) assert multi_tensor_applier.available == args.has_ext print("opt_level = {}".format(args.opt_level)) diff --git a/tests/distributed/synced_batchnorm/single_gpu_unit_test.py b/tests/distributed/synced_batchnorm/single_gpu_unit_test.py index 93f187fb1..446b6b0b7 100644 --- a/tests/distributed/synced_batchnorm/single_gpu_unit_test.py +++ b/tests/distributed/synced_batchnorm/single_gpu_unit_test.py @@ -3,8 +3,7 @@ import apex if True: print("using setup tools") - from apex.op_builder import SyncBnBuilder - syncbn = SyncBnBuilder().load() + import syncbn else: print("using jit") from torch.utils.cpp_extension import load @@ -31,7 +30,7 @@ def compare(desc, inp1, inp2, error): error = 
1e-5 np.random.seed(1) -dtype = np.float64 +dtype = np.float32 inp = (np.random.randn(batch_size, feature_size, space_size, space_size)).astype(dtype) grad = (np.random.randn(batch_size, feature_size, space_size, space_size)).astype(dtype) weight = (np.random.randn(feature_size)).astype(dtype) diff --git a/tests/distributed/synced_batchnorm/test_groups.py b/tests/distributed/synced_batchnorm/test_groups.py index 74b2a9b13..674f8e60a 100644 --- a/tests/distributed/synced_batchnorm/test_groups.py +++ b/tests/distributed/synced_batchnorm/test_groups.py @@ -1,8 +1,7 @@ import torch import numpy as np import apex -from apex.op_builder import SyncBnBuilder -syncbn = SyncBnBuilder().load() +import syncbn import os import argparse import torch.optim as optim diff --git a/tests/distributed/synced_batchnorm/two_gpu_unit_test.py b/tests/distributed/synced_batchnorm/two_gpu_unit_test.py index 794da411f..5daeef48a 100644 --- a/tests/distributed/synced_batchnorm/two_gpu_unit_test.py +++ b/tests/distributed/synced_batchnorm/two_gpu_unit_test.py @@ -3,8 +3,7 @@ import torch import numpy as np import apex -from apex.op_builder import SyncBnBuilder -syncbn = SyncBnBuilder().load() +import syncbn import os import argparse import torch.optim as optim From c21c31aeb3c32f3ff7be29816cab816b6cdc5eac Mon Sep 17 00:00:00 2001 From: skishore Date: Wed, 13 Aug 2025 18:38:06 +0000 Subject: [PATCH 59/79] renamed compatibility/scaled_masked_softmax_cuda.py, added some extra tests in the contrib test runner --- apex/contrib/test/run_rocm_extensions.py | 3 ++- ...ed_masked-softmax_cuda.py => scaled_masked_softmax_cuda.py} | 0 2 files changed, 2 insertions(+), 1 deletion(-) rename compatibility/{scaled_masked-softmax_cuda.py => scaled_masked_softmax_cuda.py} (100%) diff --git a/apex/contrib/test/run_rocm_extensions.py b/apex/contrib/test/run_rocm_extensions.py index c7801988b..3ad633cc2 100644 --- a/apex/contrib/test/run_rocm_extensions.py +++ b/apex/contrib/test/run_rocm_extensions.py @@ -2,7 +2,8 @@ import sys -test_dirs = ["groupbn", "fused_dense", "layer_norm", "multihead_attn", "transducer", "focal_loss", "index_mul_2d", "."] # "." for test_label_smoothing.py +test_dirs = ["groupbn", "layer_norm", "multihead_attn", "transducer", "focal_loss", "index_mul_2d", ".", \ + "optimizers", "clip_grad"] # "." for test_label_smoothing.py ROCM_BLACKLIST = [ "layer_norm" ] diff --git a/compatibility/scaled_masked-softmax_cuda.py b/compatibility/scaled_masked_softmax_cuda.py similarity index 100% rename from compatibility/scaled_masked-softmax_cuda.py rename to compatibility/scaled_masked_softmax_cuda.py From 7d2bb4cd8bc5064310c02b766d67b27343378a49 Mon Sep 17 00:00:00 2001 From: skishore Date: Wed, 13 Aug 2025 19:33:26 +0000 Subject: [PATCH 60/79] Added instructions for JIT load and changes in installation options --- README.md | 87 ++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 57 insertions(+), 30 deletions(-) diff --git a/README.md b/README.md index 89fe3ad5e..886fab545 100644 --- a/README.md +++ b/README.md @@ -112,9 +112,9 @@ The latest stable release obtainable from https://pytorch.org should also work. Apex on ROCm supports both python only build and extension build. Note: Pytorch version recommended is >=1.5 for extension build. -### To install using python only build use the following command in apex folder: +### To install and jit load the extensions use the following command in apex folder: ``` -python setup.py install +pip install . 
--no-build-isolation ``` ======= @@ -140,26 +140,67 @@ ubuntu|pytorch|apex|release/1.0.0|06c33eee43f7a22f3ed7d9c3e5be0ddd757dc345|https://github.com/ROCmSoftwarePlatform/apex centos|pytorch|apex|release/1.0.0|06c33eee43f7a22f3ed7d9c3e5be0ddd757dc345|https://github.com/ROCmSoftwarePlatform/apex ``` -### To install using extensions enabled use the following command in apex folder: +### To build all the extensions at install time, use the following command in apex folder: +``` +APEX_BUILD_CPP_OPS=1 APEX_BUILD_CUDA_OPS=1 pip install . --no-build-isolation ``` -# if pip >= 23.1 (ref: https://pip.pypa.io/en/stable/news/#v23-1) which supports multiple `--config-settings` with the same key... -pip install -v --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./ -# otherwise -python setup.py install --cpp_ext --cuda_ext +It is possible to build specific extensions by using the following command in apex folder: +``` +APEX_BUILD_<EXTENSION>=1 pip install . --no-build-isolation +``` +The following extensions are supported: +| extension | environment to build specific extension | install option | +|-----------|-----------|-----------| +| amp_C | APEX_BUILD_AMP_C=1 | APEX_BUILD_CUDA_OPS=1 | +| apex_C | APEX_BUILD_APEX_C=1 | APEX_BUILD_CPP_OPS=1 | +| bnp | APEX_BUILD_BNP=1 | APEX_BUILD_CUDA_OPS=1 | +| distributed_adam_cuda | APEX_BUILD_DISTRIBUTED_ADAM=1 | APEX_BUILD_CUDA_OPS=1 | +| distributed_lamb_cuda | APEX_BUILD_DISTRIBUTED_LAMB=1 | APEX_BUILD_CUDA_OPS=1 | +| fast_multihead_attn | APEX_BUILD_FAST_MULTIHEAD_ATTN=1 | APEX_BUILD_CUDA_OPS=1 | +| focal_loss_cuda | APEX_BUILD_FOCAL_LOSS=1 | APEX_BUILD_CUDA_OPS=1 | +| fused_adam_cuda | APEX_BUILD_FUSED_ADAM=1 | APEX_BUILD_CUDA_OPS=1 | +| fused_bias_swiglu | APEX_BUILD_FUSED_BIAS_SWIGLU=1 | APEX_BUILD_CUDA_OPS=1 | +| fused_dense_cuda | APEX_BUILD_FUSED_DENSE=1 | APEX_BUILD_CUDA_OPS=1 | +| fused_index_mul_2d | APEX_BUILD_FUSED_INDEX_MUL_2D=1 | APEX_BUILD_CUDA_OPS=1 | +| fused_lamb_cuda | APEX_BUILD_FUSED_LAMB=1 | APEX_BUILD_CUDA_OPS=1 | +| fused_layer_norm_cuda | APEX_BUILD_FUSED_LAYER_NORM=1 | APEX_BUILD_CUDA_OPS=1 | +| fused_rotary_positional_embedding | APEX_BUILD_FUSED_ROPE=1 | APEX_BUILD_CUDA_OPS=1 | +| fused_weight_gradient_mlp_cuda | APEX_BUILD_FUSED_WEIGHT_GRADIENT_MLP=1 | APEX_BUILD_CUDA_OPS=1 | +| generic_scaled_masked_softmax_cuda | APEX_BUILD_GENERIC_SCALED_MASKED_SOFTMAX_CUDA=1 | APEX_BUILD_CUDA_OPS=1 | +| mlp_cuda | APEX_BUILD_MLP=1 | APEX_BUILD_CUDA_OPS=1 | +| _apex_nccl_allocator | APEX_BUILD_NCCL_ALLOCATOR=1 | APEX_BUILD_CUDA_OPS=1 | +| nccl_p2p_cuda | APEX_BUILD_NCCL_P2P=1 | APEX_BUILD_CUDA_OPS=1 | +| peer_memory_cuda | APEX_BUILD_PEER_MEMORY=1 | APEX_BUILD_CUDA_OPS=1 | +| scaled_masked_softmax_cuda | APEX_BUILD_SCALED_MASKED_SOFTMAX_CUDA=1 | APEX_BUILD_CUDA_OPS=1 | +| scaled_softmax_cuda | APEX_BUILD_SCALED_SOFTMAX_CUDA=1 | APEX_BUILD_CUDA_OPS=1 | +| scaled_upper_triang_masked_softmax_cuda | APEX_BUILD_SCALED_UPPER_TRIANG_MASKED_SOFTMAX_CUDA=1 | APEX_BUILD_CUDA_OPS=1 | +| syncbn | APEX_BUILD_SYNCBN=1 | APEX_BUILD_CUDA_OPS=1 | +| transducer_joint_cuda | APEX_BUILD_TRANSDUCER_JOINT=1 | APEX_BUILD_CUDA_OPS=1 | +| transducer_loss_cuda | APEX_BUILD_TRANSDUCER_LOSS=1 | APEX_BUILD_CUDA_OPS=1 | +| xentropy_cuda | APEX_BUILD_XENTROPY=1 | APEX_BUILD_CUDA_OPS=1 | + +For example, to build FUSED_DENSE you can use the following command: ``` -Note that using --cuda_ext flag to install Apex will also enable all the extensions supported on ROCm including "--distributed_adam", "--distributed_lamb", "--bnp", "--xentropy", "--deprecated_fused_adam", "--deprecated_fused_lamb", and "--fast_multihead_attn". +APEX_BUILD_FUSED_DENSE=1 pip install . --no-build-isolation +``` +This will install the FUSED_DENSE module; the rest of the modules are JIT-loaded.
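The hunk above is the behavior worth illustrating: an extension that is not prebuilt at install time is compiled on first use through its op builder. A minimal sketch of that JIT-load path from Python follows; it assumes the builder exported from op_builder/fused_dense.py is named `FusedDenseBuilder`, mirroring the `AmpCBuilder().load()` / `SyncBnBuilder().load()` pattern visible in the test diffs earlier in this series, so treat the exact class name and the caching details as assumptions rather than documented API.

```python
# Minimal sketch of JIT-loading an Apex extension through its op builder.
# Assumption: op_builder/fused_dense.py exposes a class named FusedDenseBuilder,
# following the same Builder().load() pattern used with AmpCBuilder and
# SyncBnBuilder elsewhere in this series.
from apex.op_builder import FusedDenseBuilder

builder = FusedDenseBuilder()

# load() is expected to return the prebuilt extension when it was compiled at
# install time (APEX_BUILD_FUSED_DENSE=1) and to fall back to compiling the
# sources on first use otherwise, caching the resulting shared object.
fused_dense_cuda = builder.load()

# The loaded module stands in for the statically built fused_dense_cuda
# extension, so callers such as apex.fused_dense can use it unchanged.
print(type(fused_dense_cuda))
```

Either way the call site is identical, which is why the table above can describe every extension with the same pair of switches: the choice between the prebuilt and JIT paths is made inside `load()`.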
"--deprecated_fused_lamb", and "--fast_multihead_attn". +APEX_BUILD_FUSED_DENSE​=1 pip install . --no-build-isolation +``` +This will install FUSED_DENSE​ module and rest of the modules are JIT loaded. + + -In addition, aiter backend can be built during apex installation by providing --aiter flag +Aiter backend can be built and used for fused rope. To install aiter: ``` -# if pip >= 23.1 (ref: https://pip.pypa.io/en/stable/news/#v23-1) which supports multiple `--config-settings` with the same key... -pip install -v --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" --config-settings "--build-option=--aiter" ./ -# otherwise -python setup.py install --cpp_ext --cuda_ext --aiter +make aiter ``` To use aiter in fused rope, you can use the flag ```USE_ROCM_AITER_ROPE_BACKEND=1```. +### To uninstall apex and its extensions, use the following command in apex folder: +``` +pip uninstall apex +make clean +``` + ### Enable hipblasLT on ROCm hipblasLT is supported only on mi300 (gfx942) only. python setup.py automatically builds apex with hipblasLT support only if GPU device id is gfx942 @@ -173,27 +214,13 @@ CUDA and C++ extensions via ```bash git clone https://github.com/rocm/apex cd apex -# if pip >= 23.1 (ref: https://pip.pypa.io/en/stable/news/#v23-1) which supports multiple `--config-settings` with the same key... -pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./ -# otherwise -pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --global-option="--cpp_ext" --global-option="--cuda_ext" ./ -``` - -Apex also supports a Python-only build via -```bash -pip install -v --disable-pip-version-check --no-build-isolation --no-cache-dir ./ +pip install . --no-build-isolation ``` -A Python-only build omits: -- Fused kernels required to use `apex.optimizers.FusedAdam`. -- Fused kernels required to use `apex.normalization.FusedLayerNorm` and `apex.normalization.FusedRMSNorm`. -- Fused kernels that improve the performance and numerical stability of `apex.parallel.SyncBatchNorm`. -- Fused kernels that improve the performance of `apex.parallel.DistributedDataParallel` and `apex.amp`. -`DistributedDataParallel`, `amp`, and `SyncBatchNorm` will still be usable, but they may be slower. ### [Experimental] Windows -`pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .` may work if you were able to build Pytorch from source -on your system. A Python-only build via `pip install -v --no-cache-dir .` is more likely to work. +`pip install . --no-build-isolation` may work if you were able to build Pytorch from source +on your system. A Python-only build via `pip install --no-build-isolation -v --no-cache-dir .` is more likely to work. If you installed Pytorch in a Conda environment, make sure to install Apex in that same environment. From f80e4340feb90fbe028d48d99500946b4d5e25a4 Mon Sep 17 00:00:00 2001 From: skishore Date: Wed, 13 Aug 2025 19:41:35 +0000 Subject: [PATCH 61/79] Restructuring the README --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 886fab545..e38b59218 100644 --- a/README.md +++ b/README.md @@ -117,7 +117,6 @@ Note: Pytorch version recommended is >=1.5 for extension build. pip install . 
--no-build-isolation ``` -======= ### Supported Versions | ``APEX Version`` | ``APEX branch`` | ``Torch Version`` | |------------------|-----------------|-------------------| From 4b4b774a71b87555f401ab096aeca367852fc97e Mon Sep 17 00:00:00 2001 From: skishore Date: Wed, 13 Aug 2025 19:46:56 +0000 Subject: [PATCH 62/79] Added instructions for building wheel --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index e38b59218..b1d89483e 100644 --- a/README.md +++ b/README.md @@ -194,6 +194,12 @@ make aiter To use aiter in fused rope, you can use the flag ```USE_ROCM_AITER_ROPE_BACKEND=1```. +### To create a wheel and then install using the wheel, use the following command in apex folder: +``` +python -m build --wheel --no-isolation (can use the same environment variables to build specific extensions, cpp extensions and cuda extensions) +pip install dist/apex-*.whl​ +``` + ### To uninstall apex and its extensions, use the following command in apex folder: ``` pip uninstall apex From 71f9d67c048b4489fd3293a6f9fb81fcddc3107c Mon Sep 17 00:00:00 2001 From: skishore Date: Fri, 31 Oct 2025 12:19:11 +0000 Subject: [PATCH 63/79] replaced TorchCPUBuilder with CPUBuilder, added a main method in contrib test runner --- apex/contrib/test/run_rocm_extensions.py | 23 ++++++++++++----------- op_builder/__init__.py | 2 +- op_builder/all_ops.py | 3 +-- op_builder/apex_C.py | 4 ++-- op_builder/builder.py | 2 +- 5 files changed, 17 insertions(+), 17 deletions(-) diff --git a/apex/contrib/test/run_rocm_extensions.py b/apex/contrib/test/run_rocm_extensions.py index 3ad633cc2..1c9add5d8 100644 --- a/apex/contrib/test/run_rocm_extensions.py +++ b/apex/contrib/test/run_rocm_extensions.py @@ -8,20 +8,21 @@ "layer_norm" ] -runner = unittest.TextTestRunner(verbosity=2) +if __name__ == '__main__': + runner = unittest.TextTestRunner(verbosity=2) -errcode = 0 + errcode = 0 -for test_dir in test_dirs: - if test_dir in ROCM_BLACKLIST: - continue - suite = unittest.TestLoader().discover(test_dir) + for test_dir in test_dirs: + if test_dir in ROCM_BLACKLIST: + continue + suite = unittest.TestLoader().discover(test_dir) - print("\nExecuting tests from " + test_dir) + print("\nExecuting tests from " + test_dir) - result = runner.run(suite) + result = runner.run(suite) - if not result.wasSuccessful(): - errcode = 1 + if not result.wasSuccessful(): + errcode = 1 -sys.exit(errcode) + sys.exit(errcode) diff --git a/op_builder/__init__.py b/op_builder/__init__.py index 5b1b484c9..34dbd96c3 100644 --- a/op_builder/__init__.py +++ b/op_builder/__init__.py @@ -46,7 +46,7 @@ def _builder(): if module_name != 'all_ops' and module_name != 'builder': module = importlib.import_module(f".{module_name}", package=op_builder_dir) for member_name in module.__dir__(): - if member_name.endswith('Builder') and member_name != "OpBuilder" and member_name != "CUDAOpBuilder": + if member_name.endswith('Builder') and member_name != "OpBuilder" and member_name != "CUDAOpBuilder" and member_name != "CPUOpBuilder": # assign builder name to variable with same name # the following is equivalent to i.e. 
TransformerBuilder = "TransformerBuilder" this_module.__dict__[member_name] = builder_closure(member_name) \ No newline at end of file diff --git a/op_builder/all_ops.py b/op_builder/all_ops.py index 41fa091b6..2c0f9166d 100644 --- a/op_builder/all_ops.py +++ b/op_builder/all_ops.py @@ -41,7 +41,7 @@ def _lazy_init_class_dict(self): for member_name in module.__dir__(): if member_name.endswith( 'Builder' - ) and member_name != "OpBuilder" and member_name != "CUDAOpBuilder" and member_name != "TorchCPUOpBuilder": # avoid abstract classes + ) and member_name != "OpBuilder" and member_name != "CUDAOpBuilder" and member_name != "CPUOpBuilder": # avoid abstract classes if not member_name in self.class_dict: self.class_dict[member_name] = getattr(module, member_name) # end initialize for create_op_builder() @@ -68,7 +68,6 @@ def get_op_builder(self, class_name): builder_utils = BuilderUtils() op_builder_dir = builder_utils.op_builder_dir() op_builder_module = importlib.import_module(op_builder_dir) -print ("op_builder_module", op_builder_module) __op_builders__ = [] for _, module_name, _ in pkgutil.iter_modules([os.path.dirname(op_builder_module.__file__)]): diff --git a/op_builder/apex_C.py b/op_builder/apex_C.py index 2098b3b88..b02526e77 100644 --- a/op_builder/apex_C.py +++ b/op_builder/apex_C.py @@ -1,9 +1,9 @@ -from .builder import TorchCPUOpBuilder +from .builder import CPUOpBuilder import sys -class ApexCBuilder(TorchCPUOpBuilder): +class ApexCBuilder(CPUOpBuilder): BUILD_VAR = 'APEX_BUILD_APEX_C' INCLUDE_FLAG = "APEX_BUILD_CPP_OPS" NAME = "apex_C" diff --git a/op_builder/builder.py b/op_builder/builder.py index 6784f17e0..9b8e78502 100644 --- a/op_builder/builder.py +++ b/op_builder/builder.py @@ -873,7 +873,7 @@ def torch_version(self): def is_supported(self): return super().is_supported() -class TorchCPUOpBuilder(CUDAOpBuilder): +class CPUOpBuilder(CUDAOpBuilder): def get_cuda_lib64_path(self): import torch From a569854efdfb04b96f2333427b962b37a48d5317 Mon Sep 17 00:00:00 2001 From: skishore Date: Mon, 10 Nov 2025 10:46:18 +0000 Subject: [PATCH 64/79] create a script to build different jit conditions for running different tests --- tests/jit_build/build.sh | 89 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 tests/jit_build/build.sh diff --git a/tests/jit_build/build.sh b/tests/jit_build/build.sh new file mode 100644 index 000000000..f7a4858b1 --- /dev/null +++ b/tests/jit_build/build.sh @@ -0,0 +1,89 @@ +#parse the arguments +JIT_CONDITION="$2" +echo "JIT_CONDITION $JIT_CONDITION" + +cd ../.. + +git checkout Refactor_build +git submodule update --init --recursive + +# uninstall apex +pip uninstall apex +make clean + +#install apex for different conditions +if [ "$JIT_CONDITION" = "1" ]; then + pip install . --no-build-isolation + LOG_FILE=results_jit_unit_test1.log + LOG_FILE2=results_jit_unit_test1c.log +elif [ "$JIT_CONDITION" = "2" ]; then + APEX_BUILD_CPP_OPS=1 pip install . --no-build-isolation + LOG_FILE=results_jit_unit_test2.log + LOG_FILE2=results_jit_unit_test2c.log +elif [ "$JIT_CONDITION" = "3" ]; then + APEX_BUILD_CUDA_OPS=1 pip install . --no-build-isolation + LOG_FILE=results_jit_unit_test3.log + LOG_FILE2=results_jit_unit_test3c.log +elif [ "$JIT_CONDITION" = "4" ]; then + APEX_BUILD_CPP_OPS=1 APEX_BUILD_CUDA_OPS=1 pip install . --no-build-isolation + LOG_FILE=results_jit_unit_test4.log + LOG_FILE2=results_jit_unit_test4c.log +elif [ "$JIT_CONDITION" = "5" ]; then + APEX_BUILD_FUSED_DENSE=1 pip install . 
--no-build-isolation + LOG_FILE=results_jit_unit_test5.log + LOG_FILE2=results_jit_unit_test5c.log +elif [ "$JIT_CONDITION" = "6" ]; then + python setup.py install --cpp_ext --cuda_ext + LOG_FILE=results_jit_unit_test6.log + LOG_FILE2=results_jit_unit_test6c.log +elif [ "$JIT_CONDITION" = "7" ]; then + APEX_BUILD_AMP_C=1 APEX_BUILD_APEX_C=1 APEX_BUILD_BNP=1 \ + APEX_BUILD_DISTRIBUTED_ADAM=1 APEX_BUILD_DISTRIBUTED_LAMB=1 APEX_BUILD_FAST_MULTIHEAD_ATTN=1 \ + APEX_BUILD_FOCAL_LOSS=1 APEX_BUILD_FUSED_ADAM=1 APEX_BUILD_FUSED_BIAS_SWIGLU=1 \ + APEX_BUILD_FUSED_DENSE=1 APEX_BUILD_FUSED_INDEX_MUL_2D=1 APEX_BUILD_FUSED_LAMB=1 \ + APEX_BUILD_FUSED_LAYER_NORM=1 APEX_BUILD_FUSED_ROPE=1 APEX_BUILD_FUSED_WEIGHT_GRADIENT_MLP=1 \ + APEX_BUILD_GENERIC_SCALED_MASKED_SOFTMAX_CUDA=1 APEX_BUILD_MLP=1 APEX_BUILD_NCCL_ALLOCATOR=1 \ + APEX_BUILD_NCCL_P2P=1 APEX_BUILD_PEER_MEMORY=1 APEX_BUILD_SCALED_MASKED_SOFTMAX_CUDA=1 \ + APEX_BUILD_SCALED_SOFTMAX_CUDA=1 APEX_BUILD_SCALED_UPPER_TRIANG_MASKED_SOFTMAX_CUDA=1 APEX_BUILD_SYNCBN=1 \ + APEX_BUILD_TRANSDUCER_JOINT=1 APEX_BUILD_TRANSDUCER_LOSS=1 APEX_BUILD_XENTROPY=1 pip install . --no-build-isolation + LOG_FILE=results_jit_unit_test7.log + LOG_FILE2=results_jit_unit_test7c.log +elif [ "$JIT_CONDITION" = "8" ]; then + python -m build --wheel --no-isolation . + pip install dist/apex-*.whl + LOG_FILE=results_jit_unit_test8.log + LOG_FILE2=results_jit_unit_test8c.log +elif [ "$JIT_CONDITION" = "9" ]; then + APEX_BUILD_CPP_OPS=1 python -m build --wheel --no-isolation . + pip install dist/apex-*.whl + LOG_FILE=results_jit_unit_test9.log + LOG_FILE2=results_jit_unit_test9c.log +elif [ "$JIT_CONDITION" = "10" ]; then + APEX_BUILD_CUDA_OPS=1 python -m build --wheel --no-isolation . + pip install dist/apex-*.whl + LOG_FILE=results_jit_unit_test10.log + LOG_FILE2=results_jit_unit_test10c.log +elif [ "$JIT_CONDITION" = "11" ]; then + APEX_BUILD_CPP_OPS=1 APEX_BUILD_CUDA_OPS=1 python -m build --wheel --no-isolation . + pip install dist/apex-*.whl + LOG_FILE=results_jit_unit_test11.log + LOG_FILE2=results_jit_unit_test11c.log +elif [ "$JIT_CONDITION" = "12" ]; then + APEX_BUILD_FUSED_DENSE=1 python -m build --wheel --no-isolation . + pip install dist/apex-*.whl + LOG_FILE=results_jit_unit_test12.log + LOG_FILE2=results_jit_unit_test12c.log +elif [ "$JIT_CONDITION" = "13" ]; then + APEX_BUILD_AMP_C=1 APEX_BUILD_APEX_C=1 APEX_BUILD_BNP=1 \ + APEX_BUILD_DISTRIBUTED_ADAM=1 APEX_BUILD_DISTRIBUTED_LAMB=1 APEX_BUILD_FAST_MULTIHEAD_ATTN=1 \ + APEX_BUILD_FOCAL_LOSS=1 APEX_BUILD_FUSED_ADAM=1 APEX_BUILD_FUSED_BIAS_SWIGLU=1 \ + APEX_BUILD_FUSED_DENSE=1 APEX_BUILD_FUSED_INDEX_MUL_2D=1 APEX_BUILD_FUSED_LAMB=1 \ + APEX_BUILD_FUSED_LAYER_NORM=1 APEX_BUILD_FUSED_ROPE=1 APEX_BUILD_FUSED_WEIGHT_GRADIENT_MLP=1 \ + APEX_BUILD_GENERIC_SCALED_MASKED_SOFTMAX_CUDA=1 APEX_BUILD_MLP=1 APEX_BUILD_NCCL_ALLOCATOR=1 \ + APEX_BUILD_NCCL_P2P=1 APEX_BUILD_PEER_MEMORY=1 APEX_BUILD_SCALED_MASKED_SOFTMAX_CUDA=1 \ + APEX_BUILD_SCALED_SOFTMAX_CUDA=1 APEX_BUILD_SCALED_UPPER_TRIANG_MASKED_SOFTMAX_CUDA=1 APEX_BUILD_SYNCBN=1 \ + APEX_BUILD_TRANSDUCER_JOINT=1 APEX_BUILD_TRANSDUCER_LOSS=1 APEX_BUILD_XENTROPY=1 python -m build --wheel --no-isolation . 
+ pip install dist/apex-*.whl + LOG_FILE=results_jit_unit_test13.log + LOG_FILE2=results_jit_unit_test13c.log +fi \ No newline at end of file From f263567ce4dc49868995d53195e5219e7387eb91 Mon Sep 17 00:00:00 2001 From: skishore Date: Tue, 11 Nov 2025 15:14:59 +0000 Subject: [PATCH 65/79] add script to run tests with different jit builds, add instructions to run jit build and tests in readme, add other tests in readme --- README.md | 38 +++++++++++++++++++--- tests/jit_build/build.sh | 31 ++---------------- tests/jit_build/count_built_so.py | 11 +++++++ tests/jit_build/count_failed_unit_tests.py | 16 +++++++++ tests/jit_build/count_torch_extensions.py | 9 +++++ tests/jit_build/load_extra_extensions.py | 16 +++++++++ tests/jit_build/run_tests.sh | 35 ++++++++++++++++++++ 7 files changed, 122 insertions(+), 34 deletions(-) create mode 100644 tests/jit_build/count_built_so.py create mode 100644 tests/jit_build/count_failed_unit_tests.py create mode 100644 tests/jit_build/count_torch_extensions.py create mode 100644 tests/jit_build/load_extra_extensions.py create mode 100644 tests/jit_build/run_tests.sh diff --git a/README.md b/README.md index b1d89483e..0f6302655 100644 --- a/README.md +++ b/README.md @@ -100,15 +100,12 @@ Note that we recommend restoring the model using the same `opt_level`. Also note # Installation ## Containers -ROCm pytorch containers are available from https://hub.docker.com/r/rocm/pytorch. +ROCm pytorch containers contain apex package and these are available from https://hub.docker.com/r/rocm/pytorch. ## From Source -To install Apex from source, we recommend using the nightly Pytorch obtainable from https://github.com/rocm/pytorch. +Torch must be installed before installing apex. We recommend using the nightly Pytorch obtainable from https://github.com/rocm/pytorch. The latest stable release obtainable from https://pytorch.org should also work. -The latest stable release obtainable from https://pytorch.org should also work. - -## ROCm Apex on ROCm supports both python only build and extension build. Note: Pytorch version recommended is >=1.5 for extension build. @@ -228,6 +225,37 @@ pip install . --no-build-isolation on your system. A Python-only build via `pip install --no-build-isolation -v --no-cache-dir .` is more likely to work. If you installed Pytorch in a Conda environment, make sure to install Apex in that same environment. +# Testing + +## L0 tests + +``` +cd tests/L0 +sh run_rocm.sh +``` + +## contrib tests + +``` +cd apex/contrib/test +python run_rocm_extensions.py +torchrun --nproc_per_node 8 apex/contrib/peer_memory/peer_halo_exchange_module_tests.py +``` + +## Distributed tests + +``` +cd tests/distributed/synced_batchnorm +sh unit_test.sh +``` + +## JIT build and then run tests + +``` +sh tests/jit_build/build.sh condition 1 +sh tests/jit_build/run_tests.sh condition 1 +``` +where condition is from 1 to 13 # Release notes diff --git a/tests/jit_build/build.sh b/tests/jit_build/build.sh index f7a4858b1..1cb09af96 100644 --- a/tests/jit_build/build.sh +++ b/tests/jit_build/build.sh @@ -2,40 +2,28 @@ JIT_CONDITION="$2" echo "JIT_CONDITION $JIT_CONDITION" -cd ../.. +echo $(pwd) git checkout Refactor_build git submodule update --init --recursive # uninstall apex -pip uninstall apex +pip uninstall apex -y make clean #install apex for different conditions if [ "$JIT_CONDITION" = "1" ]; then pip install . 
--no-build-isolation - LOG_FILE=results_jit_unit_test1.log - LOG_FILE2=results_jit_unit_test1c.log elif [ "$JIT_CONDITION" = "2" ]; then APEX_BUILD_CPP_OPS=1 pip install . --no-build-isolation - LOG_FILE=results_jit_unit_test2.log - LOG_FILE2=results_jit_unit_test2c.log elif [ "$JIT_CONDITION" = "3" ]; then APEX_BUILD_CUDA_OPS=1 pip install . --no-build-isolation - LOG_FILE=results_jit_unit_test3.log - LOG_FILE2=results_jit_unit_test3c.log elif [ "$JIT_CONDITION" = "4" ]; then APEX_BUILD_CPP_OPS=1 APEX_BUILD_CUDA_OPS=1 pip install . --no-build-isolation - LOG_FILE=results_jit_unit_test4.log - LOG_FILE2=results_jit_unit_test4c.log elif [ "$JIT_CONDITION" = "5" ]; then APEX_BUILD_FUSED_DENSE=1 pip install . --no-build-isolation - LOG_FILE=results_jit_unit_test5.log - LOG_FILE2=results_jit_unit_test5c.log elif [ "$JIT_CONDITION" = "6" ]; then python setup.py install --cpp_ext --cuda_ext - LOG_FILE=results_jit_unit_test6.log - LOG_FILE2=results_jit_unit_test6c.log elif [ "$JIT_CONDITION" = "7" ]; then APEX_BUILD_AMP_C=1 APEX_BUILD_APEX_C=1 APEX_BUILD_BNP=1 \ APEX_BUILD_DISTRIBUTED_ADAM=1 APEX_BUILD_DISTRIBUTED_LAMB=1 APEX_BUILD_FAST_MULTIHEAD_ATTN=1 \ @@ -46,33 +34,20 @@ elif [ "$JIT_CONDITION" = "7" ]; then APEX_BUILD_NCCL_P2P=1 APEX_BUILD_PEER_MEMORY=1 APEX_BUILD_SCALED_MASKED_SOFTMAX_CUDA=1 \ APEX_BUILD_SCALED_SOFTMAX_CUDA=1 APEX_BUILD_SCALED_UPPER_TRIANG_MASKED_SOFTMAX_CUDA=1 APEX_BUILD_SYNCBN=1 \ APEX_BUILD_TRANSDUCER_JOINT=1 APEX_BUILD_TRANSDUCER_LOSS=1 APEX_BUILD_XENTROPY=1 pip install . --no-build-isolation - LOG_FILE=results_jit_unit_test7.log - LOG_FILE2=results_jit_unit_test7c.log elif [ "$JIT_CONDITION" = "8" ]; then python -m build --wheel --no-isolation . pip install dist/apex-*.whl - LOG_FILE=results_jit_unit_test8.log - LOG_FILE2=results_jit_unit_test8c.log elif [ "$JIT_CONDITION" = "9" ]; then APEX_BUILD_CPP_OPS=1 python -m build --wheel --no-isolation . - pip install dist/apex-*.whl - LOG_FILE=results_jit_unit_test9.log - LOG_FILE2=results_jit_unit_test9c.log elif [ "$JIT_CONDITION" = "10" ]; then APEX_BUILD_CUDA_OPS=1 python -m build --wheel --no-isolation . pip install dist/apex-*.whl - LOG_FILE=results_jit_unit_test10.log - LOG_FILE2=results_jit_unit_test10c.log elif [ "$JIT_CONDITION" = "11" ]; then APEX_BUILD_CPP_OPS=1 APEX_BUILD_CUDA_OPS=1 python -m build --wheel --no-isolation . pip install dist/apex-*.whl - LOG_FILE=results_jit_unit_test11.log - LOG_FILE2=results_jit_unit_test11c.log elif [ "$JIT_CONDITION" = "12" ]; then APEX_BUILD_FUSED_DENSE=1 python -m build --wheel --no-isolation . pip install dist/apex-*.whl - LOG_FILE=results_jit_unit_test12.log - LOG_FILE2=results_jit_unit_test12c.log elif [ "$JIT_CONDITION" = "13" ]; then APEX_BUILD_AMP_C=1 APEX_BUILD_APEX_C=1 APEX_BUILD_BNP=1 \ APEX_BUILD_DISTRIBUTED_ADAM=1 APEX_BUILD_DISTRIBUTED_LAMB=1 APEX_BUILD_FAST_MULTIHEAD_ATTN=1 \ @@ -84,6 +59,4 @@ elif [ "$JIT_CONDITION" = "13" ]; then APEX_BUILD_SCALED_SOFTMAX_CUDA=1 APEX_BUILD_SCALED_UPPER_TRIANG_MASKED_SOFTMAX_CUDA=1 APEX_BUILD_SYNCBN=1 \ APEX_BUILD_TRANSDUCER_JOINT=1 APEX_BUILD_TRANSDUCER_LOSS=1 APEX_BUILD_XENTROPY=1 python -m build --wheel --no-isolation . 
pip install dist/apex-*.whl - LOG_FILE=results_jit_unit_test13.log - LOG_FILE2=results_jit_unit_test13c.log fi \ No newline at end of file diff --git a/tests/jit_build/count_built_so.py b/tests/jit_build/count_built_so.py new file mode 100644 index 000000000..0381d65f0 --- /dev/null +++ b/tests/jit_build/count_built_so.py @@ -0,0 +1,11 @@ +import glob +import os +import site + + +SITE_PACKAGES_FOLDERS = site.getsitepackages()[0] + +#count the number of *.so files in the folder +so_files = glob.glob(os.path.join(SITE_PACKAGES_FOLDERS, "**/*.so"), recursive=True) +count = len(so_files) +print(count) diff --git a/tests/jit_build/count_failed_unit_tests.py b/tests/jit_build/count_failed_unit_tests.py new file mode 100644 index 000000000..9797c3d67 --- /dev/null +++ b/tests/jit_build/count_failed_unit_tests.py @@ -0,0 +1,16 @@ +import sys + +test_file = sys.argv[1] + +#read lines from test file +with open(test_file, "r") as f: + lines = f.readlines() + +failed_tests = [] +for line in lines: + if "ERROR: " in line: + failed_tests.append(line[7:].strip()) + if " FAILED" in line: + failed_tests.append(line[: -6].strip()) +print(len(failed_tests)) +#print(str(len(failed_tests)) + "," + ";".join(failed_tests)) \ No newline at end of file diff --git a/tests/jit_build/count_torch_extensions.py b/tests/jit_build/count_torch_extensions.py new file mode 100644 index 000000000..3c8a9fda3 --- /dev/null +++ b/tests/jit_build/count_torch_extensions.py @@ -0,0 +1,9 @@ +import os + +import torch.utils.cpp_extension + +torch_ext_directory = torch.utils.cpp_extension._get_build_directory("", False) +#count the number of folders +folders = [f for f in os.listdir(torch_ext_directory) if os.path.isdir(os.path.join(torch_ext_directory, f))] +count = len(folders) +print(count) \ No newline at end of file diff --git a/tests/jit_build/load_extra_extensions.py b/tests/jit_build/load_extra_extensions.py new file mode 100644 index 000000000..16d25d2f8 --- /dev/null +++ b/tests/jit_build/load_extra_extensions.py @@ -0,0 +1,16 @@ +from apex.op_builder.fused_lamb import FusedLambBuilder +from apex.op_builder.generic_scaled_masked_softmax_cuda import GenericScaledMaskedSoftmaxCudaBuilder +from apex.op_builder.scaled_softmax_cuda import ScaledSoftmaxCudaBuilder +from apex.op_builder.nccl_p2p import NCCLP2PBuilder + +''' +generic_scaled_masked_softmax_cuda +scaled_softmax_cuda +fused_lamb_cuda +nccl_p2p_cuda +''' + +FusedLambBuilder().load() +GenericScaledMaskedSoftmaxCudaBuilder().load() +ScaledSoftmaxCudaBuilder().load() +NCCLP2PBuilder().load() \ No newline at end of file diff --git a/tests/jit_build/run_tests.sh b/tests/jit_build/run_tests.sh new file mode 100644 index 000000000..45e630a5e --- /dev/null +++ b/tests/jit_build/run_tests.sh @@ -0,0 +1,35 @@ +#parse the arguments +JIT_CONDITION="$2" +echo "JIT_CONDITION $JIT_CONDITION" + +#run the apex unit tests +LOG_FILE=results_jit_unit_test${JIT_CONDITION}.log +LOG_FILE2=results_jit_unit_test${JIT_CONDITION}c.log + +cd tests/L0 +PYTHONUNBUFFERED=1 sh run_rocm.sh 2>&1 | tee ../../$LOG_FILE +cd ../../ + +cd apex/contrib/test +PYTHONUNBUFFERED=1 python run_rocm_extensions.py 2>&1 | tee ../../../$LOG_FILE2 +cd ../../../ + +torchrun --nproc_per_node 8 apex/contrib/peer_memory/peer_halo_exchange_module_tests.py 2>&1 | tee -a ../../../$LOG_FILE2 + +cd tests/distributed/synced_batchnorm +sh unit_test.sh 2>&1 | tee -a ../../../$LOG_FILE2 +cd ../../../ + +#explicitly load the builder and build the remaining extensions +cd ../ +python run_directory/load_extra_extensions.py 2>&1 | tee 
../../../apex/$LOG_FILE + +FAILED_TESTS=$(python run_directory/count_failed_unit_tests.py apex/$LOG_FILE) +FAILED_TESTS2=$(python run_directory/count_failed_unit_tests.py apex/$LOG_FILE2) +BUILT_SO_COUNT=$(python run_directory/count_built_so.py) +TORCH_EXTENSIONS_COUNT=$(python run_directory/count_torch_extensions.py) + +echo "Failed L0 tests = $FAILED_TESTS" +echo "Failed contrib tests = $FAILED_TESTS2" +echo ".so count = $BUILT_SO_COUNT" +echo "JIT torch extensions count = $TORCH_EXTENSIONS_COUNT" From cdf3a319a3eb362f4a4733bf0ff6904e4d648bb7 Mon Sep 17 00:00:00 2001 From: skishore Date: Tue, 18 Nov 2025 14:51:08 +0000 Subject: [PATCH 66/79] fix the issues with running the tests - improper paths, counting .so files in apex folder --- tests/jit_build/count_built_so.py | 3 ++- tests/jit_build/count_failed_unit_tests.py | 4 ++-- tests/jit_build/run_tests.sh | 13 ++++++------- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/jit_build/count_built_so.py b/tests/jit_build/count_built_so.py index 0381d65f0..f671fc3af 100644 --- a/tests/jit_build/count_built_so.py +++ b/tests/jit_build/count_built_so.py @@ -6,6 +6,7 @@ SITE_PACKAGES_FOLDERS = site.getsitepackages()[0] #count the number of *.so files in the folder -so_files = glob.glob(os.path.join(SITE_PACKAGES_FOLDERS, "**/*.so"), recursive=True) +so_files = glob.glob(os.path.join(SITE_PACKAGES_FOLDERS, "apex/*.so"), recursive=True) +print (so_files) count = len(so_files) print(count) diff --git a/tests/jit_build/count_failed_unit_tests.py b/tests/jit_build/count_failed_unit_tests.py index 9797c3d67..c6d95d3ea 100644 --- a/tests/jit_build/count_failed_unit_tests.py +++ b/tests/jit_build/count_failed_unit_tests.py @@ -10,7 +10,7 @@ for line in lines: if "ERROR: " in line: failed_tests.append(line[7:].strip()) - if " FAILED" in line: - failed_tests.append(line[: -6].strip()) + if " FAILED" in line and "#" not in line: + failed_tests.append(line[: -8].strip()) print(len(failed_tests)) #print(str(len(failed_tests)) + "," + ";".join(failed_tests)) \ No newline at end of file diff --git a/tests/jit_build/run_tests.sh b/tests/jit_build/run_tests.sh index 45e630a5e..32c6977c1 100644 --- a/tests/jit_build/run_tests.sh +++ b/tests/jit_build/run_tests.sh @@ -14,20 +14,19 @@ cd apex/contrib/test PYTHONUNBUFFERED=1 python run_rocm_extensions.py 2>&1 | tee ../../../$LOG_FILE2 cd ../../../ -torchrun --nproc_per_node 8 apex/contrib/peer_memory/peer_halo_exchange_module_tests.py 2>&1 | tee -a ../../../$LOG_FILE2 +torchrun --nproc_per_node 8 apex/contrib/peer_memory/peer_halo_exchange_module_tests.py 2>&1 | tee -a $LOG_FILE2 cd tests/distributed/synced_batchnorm sh unit_test.sh 2>&1 | tee -a ../../../$LOG_FILE2 cd ../../../ #explicitly load the builder and build the remaining extensions -cd ../ -python run_directory/load_extra_extensions.py 2>&1 | tee ../../../apex/$LOG_FILE +python tests/jit_build/load_extra_extensions.py 2>&1 | tee $LOG_FILE -FAILED_TESTS=$(python run_directory/count_failed_unit_tests.py apex/$LOG_FILE) -FAILED_TESTS2=$(python run_directory/count_failed_unit_tests.py apex/$LOG_FILE2) -BUILT_SO_COUNT=$(python run_directory/count_built_so.py) -TORCH_EXTENSIONS_COUNT=$(python run_directory/count_torch_extensions.py) +FAILED_TESTS=$(python tests/jit_build/count_failed_unit_tests.py $LOG_FILE) +FAILED_TESTS2=$(python tests/jit_build/count_failed_unit_tests.py $LOG_FILE2) +BUILT_SO_COUNT=$(python tests/jit_build/count_built_so.py) +TORCH_EXTENSIONS_COUNT=$(python tests/jit_build/count_torch_extensions.py) echo "Failed L0 tests = 
$FAILED_TESTS" echo "Failed contrib tests = $FAILED_TESTS2" From 22b5340d8d6868156f1d7657bd884d8dd30cee9a Mon Sep 17 00:00:00 2001 From: skishore Date: Tue, 18 Nov 2025 15:38:20 +0000 Subject: [PATCH 67/79] add mad internal scripts --- tests/jit_build/build_test.sh | 5 + .../docker/base.ubuntu.amd.Dockerfile | 3 + tests/jit_build/models.json | 171 ++++++++++++++++++ tests/jit_build/run_tests.sh | 2 + tests/jit_build/scripts/run.sh | 25 +++ 5 files changed, 206 insertions(+) create mode 100644 tests/jit_build/build_test.sh create mode 100644 tests/jit_build/docker/base.ubuntu.amd.Dockerfile create mode 100644 tests/jit_build/models.json create mode 100644 tests/jit_build/scripts/run.sh diff --git a/tests/jit_build/build_test.sh b/tests/jit_build/build_test.sh new file mode 100644 index 000000000..5e61b696c --- /dev/null +++ b/tests/jit_build/build_test.sh @@ -0,0 +1,5 @@ +#parse the arguments +JIT_CONDITION="$2" + +sh tests/jit_build/build.sh "condition" $JIT_CONDITION +sh tests/jit_build/run_tests.sh "condition" $JIT_CONDITION \ No newline at end of file diff --git a/tests/jit_build/docker/base.ubuntu.amd.Dockerfile b/tests/jit_build/docker/base.ubuntu.amd.Dockerfile new file mode 100644 index 000000000..b825ba05e --- /dev/null +++ b/tests/jit_build/docker/base.ubuntu.amd.Dockerfile @@ -0,0 +1,3 @@ +# CONTEXT {'gpu_vendor': 'AMD', 'guest_os': 'UBUNTU'} +ARG BASE_DOCKER=rocm/pytorch +FROM $BASE_DOCKER \ No newline at end of file diff --git a/tests/jit_build/models.json b/tests/jit_build/models.json new file mode 100644 index 000000000..4c46a46e2 --- /dev/null +++ b/tests/jit_build/models.json @@ -0,0 +1,171 @@ +[ + { + "name": "apex_jit_install_condition1", + "dockerfile": "docker/base", + "scripts": "scripts", + "n_gpus": "8", + "owner": "skishore@amd.com", + "multiple_results": "results_jit_unit_test.csv", + "training_precision": "", + "tags": [ + "apex_jit" + ], + "args": "--condition 1" + }, + { + "name": "apex_jit_install_condition2", + "dockerfile": "docker/base", + "scripts": "scripts", + "n_gpus": "8", + "owner": "skishore@amd.com", + "multiple_results": "results_jit_unit_test.csv", + "training_precision": "", + "tags": [ + "apex_jit" + ], + "args": "--condition 2" + }, + { + "name": "apex_jit_install_condition3", + "dockerfile": "docker/base", + "scripts": "scripts", + "n_gpus": "8", + "owner": "skishore@amd.com", + "multiple_results": "results_jit_unit_test.csv", + "training_precision": "", + "tags": [ + "apex_jit" + ], + "args": "--condition 3" + }, + { + "name": "apex_jit_install_condition4", + "dockerfile": "docker/base", + "scripts": "scripts", + "n_gpus": "8", + "owner": "skishore@amd.com", + "multiple_results": "results_jit_unit_test.csv", + "training_precision": "", + "tags": [ + "apex_jit" + ], + "args": "--condition 4" + }, + { + "name": "apex_jit_install_condition5", + "dockerfile": "docker/base", + "scripts": "scripts", + "n_gpus": "8", + "owner": "skishore@amd.com", + "multiple_results": "results_jit_unit_test.csv", + "training_precision": "", + "tags": [ + "apex_jit" + ], + "args": "--condition 5" + }, + { + "name": "apex_jit_install_condition6", + "dockerfile": "docker/base", + "scripts": "scripts", + "n_gpus": "8", + "owner": "skishore@amd.com", + "multiple_results": "results_jit_unit_test.csv", + "training_precision": "", + "tags": [ + "apex_jit" + ], + "args": "--condition 6" + }, + { + "name": "apex_jit_install_condition7", + "dockerfile": "docker/base", + "scripts": "scripts", + "n_gpus": "8", + "owner": "skishore@amd.com", + "multiple_results": 
"results_jit_unit_test.csv", + "training_precision": "", + "tags": [ + "apex_jit" + ], + "args": "--condition 7" + }, + { + "name": "apex_jit_install_condition8", + "dockerfile": "docker/base", + "scripts": "scripts", + "n_gpus": "8", + "owner": "skishore@amd.com", + "multiple_results": "results_jit_unit_test.csv", + "training_precision": "", + "tags": [ + "apex_jit" + ], + "args": "--condition 8" + }, + { + "name": "apex_jit_install_condition9", + "dockerfile": "docker/base", + "scripts": "scripts", + "n_gpus": "8", + "owner": "skishore@amd.com", + "multiple_results": "results_jit_unit_test.csv", + "training_precision": "", + "tags": [ + "apex_jit" + ], + "args": "--condition 9" + }, + { + "name": "apex_jit_install_condition10", + "dockerfile": "docker/base", + "scripts": "scripts", + "n_gpus": "8", + "owner": "skishore@amd.com", + "multiple_results": "results_jit_unit_test.csv", + "training_precision": "", + "tags": [ + "apex_jit" + ], + "args": "--condition 10" + }, + { + "name": "apex_jit_install_condition11", + "dockerfile": "docker/base", + "scripts": "scripts", + "n_gpus": "8", + "owner": "skishore@amd.com", + "multiple_results": "results_jit_unit_test.csv", + "training_precision": "", + "tags": [ + "apex_jit" + ], + "args": "--condition 11" + }, + { + "name": "apex_jit_install_condition12", + "dockerfile": "docker/base", + "scripts": "scripts", + "n_gpus": "8", + "owner": "skishore@amd.com", + "multiple_results": "results_jit_unit_test.csv", + "training_precision": "", + "tags": [ + "apex_jit" + ], + "args": "--condition 12" + }, + { + "name": "apex_jit_install_condition13", + "dockerfile": "docker/base", + "scripts": "scripts", + "n_gpus": "8", + "owner": "skishore@amd.com", + "multiple_results": "results_jit_unit_test.csv", + "training_precision": "", + "tags": [ + "apex_jit" + ], + "args": "--condition 13" + } +] \ No newline at end of file diff --git a/tests/jit_build/run_tests.sh b/tests/jit_build/run_tests.sh index 32c6977c1..eaed64629 100644 --- a/tests/jit_build/run_tests.sh +++ b/tests/jit_build/run_tests.sh @@ -32,3 +32,5 @@ echo "Failed L0 tests = $FAILED_TESTS" echo "Failed contrib tests = $FAILED_TESTS2" echo ".so count = $BUILT_SO_COUNT" echo "JIT torch extensions count = $TORCH_EXTENSIONS_COUNT" + +echo "$FAILED_TESTS $FAILED_TESTS2 $BUILT_SO_COUNT $TORCH_EXTENSIONS_COUNT" \ No newline at end of file diff --git a/tests/jit_build/scripts/run.sh b/tests/jit_build/scripts/run.sh new file mode 100644 index 000000000..d7a06784b --- /dev/null +++ b/tests/jit_build/scripts/run.sh @@ -0,0 +1,25 @@ +#parse the arguments +JIT_CONDITION="$2" + +echo $(pwd) + +WORKSPACE_DIR=/myworkspace +mkdir -p $WORKSPACE_DIR + +cd $WORKSPACE_DIR +git clone https://github.com/rocm/apex.git --recursive +cd apex +git checkout Refactor_build +git submodule update --init --recursive + +sh tests/jit_build/build.sh "condition" $JIT_CONDITION + +# Capture the output from run_tests.sh +TEST_RESULTS=$(sh tests/jit_build/run_tests.sh "condition" $JIT_CONDITION | tail -1) + +# Parse the returned values +read FAILED_TESTS FAILED_TESTS2 BUILT_SO_COUNT TORCH_EXTENSIONS_COUNT <<< "$TEST_RESULTS" + +MULTIPLE_RESULTS_FILE="results_jit_unit_test.csv" +#echo "condition,failed unit tests" > "$MULTIPLE_RESULTS_FILE" +echo "$JIT_CONDITION,$FAILED_TESTS,$FAILED_TESTS2,$BUILT_SO_COUNT,$TORCH_EXTENSIONS_COUNT" >> "$MULTIPLE_RESULTS_FILE" \ No newline at end of file From d569d5d45f23921a278342c141de9cddfbe5b535 Mon Sep 17 00:00:00 2001 From: skishore Date: Tue, 18 Nov 2025 16:48:56 +0000 Subject: [PATCH 68/79] remove 
print statement --- tests/jit_build/count_built_so.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/jit_build/count_built_so.py b/tests/jit_build/count_built_so.py index f671fc3af..353034acb 100644 --- a/tests/jit_build/count_built_so.py +++ b/tests/jit_build/count_built_so.py @@ -7,6 +7,5 @@ #count the number of *.so files in the folder so_files = glob.glob(os.path.join(SITE_PACKAGES_FOLDERS, "apex/*.so"), recursive=True) -print (so_files) count = len(so_files) print(count) From 84ccba85e3fbc23239eaf16974013b816f0897aa Mon Sep 17 00:00:00 2001 From: skishore Date: Tue, 18 Nov 2025 18:36:11 +0000 Subject: [PATCH 69/79] remove testing section from readme --- README.md | 32 -------------------------------- 1 file changed, 32 deletions(-) diff --git a/README.md b/README.md index 0f6302655..e86209a80 100644 --- a/README.md +++ b/README.md @@ -225,38 +225,6 @@ pip install . --no-build-isolation on your system. A Python-only build via `pip install --no-build-isolation -v --no-cache-dir .` is more likely to work. If you installed Pytorch in a Conda environment, make sure to install Apex in that same environment. -# Testing - -## L0 tests - -``` -cd tests/L0 -sh run_rocm.sh -``` - -## contrib tests - -``` -cd apex/contrib/test -python run_rocm_extensions.py -torchrun --nproc_per_node 8 apex/contrib/peer_memory/peer_halo_exchange_module_tests.py -``` - -## Distributed tests - -``` -cd tests/distributed/synced_batchnorm -sh unit_test.sh -``` - -## JIT build and then run tests - -``` -sh tests/jit_build/build.sh condition 1 -sh tests/jit_build/run_tests.sh condition 1 -``` -where condition is from 1 to 13 - # Release notes ## release/1.7.0 From a60e20018bff50f64b3e785cb4c80c12f3c15541 Mon Sep 17 00:00:00 2001 From: skishore Date: Wed, 19 Nov 2025 07:59:46 +0000 Subject: [PATCH 70/79] change location of result file --- tests/jit_build/scripts/run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/jit_build/scripts/run.sh b/tests/jit_build/scripts/run.sh index d7a06784b..aeb41fadd 100644 --- a/tests/jit_build/scripts/run.sh +++ b/tests/jit_build/scripts/run.sh @@ -20,6 +20,6 @@ TEST_RESULTS=$(sh tests/jit_build/run_tests.sh "condition" $JIT_CONDITION | tail # Parse the returned values read FAILED_TESTS FAILED_TESTS2 BUILT_SO_COUNT TORCH_EXTENSIONS_COUNT <<< "$TEST_RESULTS" -MULTIPLE_RESULTS_FILE="results_jit_unit_test.csv" +MULTIPLE_RESULTS_FILE="../results_jit_unit_test.csv" #echo "condition,failed unit tests" > "$MULTIPLE_RESULTS_FILE" echo "$JIT_CONDITION,$FAILED_TESTS,$FAILED_TESTS2,$BUILT_SO_COUNT,$TORCH_EXTENSIONS_COUNT" >> "$MULTIPLE_RESULTS_FILE" \ No newline at end of file From 5df477c30b193f422223b912c8fe5fdc2cc100f9 Mon Sep 17 00:00:00 2001 From: skishore Date: Wed, 19 Nov 2025 08:41:55 +0000 Subject: [PATCH 71/79] remove multiple results file from models.json --- tests/jit_build/models.json | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/tests/jit_build/models.json b/tests/jit_build/models.json index 4c46a46e2..72963295b 100644 --- a/tests/jit_build/models.json +++ b/tests/jit_build/models.json @@ -5,7 +5,6 @@ "scripts": "scripts", "n_gpus": "8", "owner": "skishore@amd.com", - "multiple_results": "results_jit_unit_test.csv", "training_precision": "", "tags": [ "apex_jit" @@ -18,7 +17,6 @@ "scripts": "scripts", "n_gpus": "8", "owner": "skishore@amd.com", - "multiple_results": "results_jit_unit_test.csv", "training_precision": "", "tags": [ "apex_jit" @@ -31,7 +29,6 @@ "scripts": "scripts", "n_gpus": "8", "owner": "skishore@amd.com", - 
"multiple_results": "results_jit_unit_test.csv", "training_precision": "", "tags": [ "apex_jit" @@ -44,7 +41,6 @@ "scripts": "scripts", "n_gpus": "8", "owner": "skishore@amd.com", - "multiple_results": "results_jit_unit_test.csv", "training_precision": "", "tags": [ "apex_jit" @@ -57,7 +53,6 @@ "scripts": "scripts", "n_gpus": "8", "owner": "skishore@amd.com", - "multiple_results": "results_jit_unit_test.csv", "training_precision": "", "tags": [ "apex_jit" @@ -70,7 +65,6 @@ "scripts": "scripts", "n_gpus": "8", "owner": "skishore@amd.com", - "multiple_results": "results_jit_unit_test.csv", "training_precision": "", "tags": [ "apex_jit" @@ -83,7 +77,6 @@ "scripts": "scripts", "n_gpus": "8", "owner": "skishore@amd.com", - "multiple_results": "results_jit_unit_test.csv", "training_precision": "", "tags": [ "apex_jit" @@ -96,7 +89,6 @@ "scripts": "scripts", "n_gpus": "8", "owner": "skishore@amd.com", - "multiple_results": "results_jit_unit_test.csv", "training_precision": "", "tags": [ "apex_jit" @@ -109,7 +101,6 @@ "scripts": "scripts", "n_gpus": "8", "owner": "skishore@amd.com", - "multiple_results": "results_jit_unit_test.csv", "training_precision": "", "tags": [ "apex_jit" @@ -122,7 +113,6 @@ "scripts": "scripts", "n_gpus": "8", "owner": "skishore@amd.com", - "multiple_results": "results_jit_unit_test.csv", "training_precision": "", "tags": [ "apex_jit" @@ -135,7 +125,6 @@ "scripts": "scripts", "n_gpus": "8", "owner": "skishore@amd.com", - "multiple_results": "results_jit_unit_test.csv", "training_precision": "", "tags": [ "apex_jit" @@ -148,7 +137,6 @@ "scripts": "scripts", "n_gpus": "8", "owner": "skishore@amd.com", - "multiple_results": "results_jit_unit_test.csv", "training_precision": "", "tags": [ "apex_jit" @@ -161,7 +149,6 @@ "scripts": "scripts", "n_gpus": "8", "owner": "skishore@amd.com", - "multiple_results": "results_jit_unit_test.csv", "training_precision": "", "tags": [ "apex_jit" From c7588417ce372df9040dbe4ea783bd004b901071 Mon Sep 17 00:00:00 2001 From: skishore Date: Sat, 22 Nov 2025 10:27:32 +0000 Subject: [PATCH 72/79] add platform specific description to wheel name even if no CppExtension or CUDAExtension is built with JIT load approach --- setup.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 2614fa3df..84e2d6294 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import glob from packaging.version import parse, Version -from setuptools import setup, find_packages +from setuptools import setup, find_packages, Distribution import subprocess import torch @@ -286,7 +286,10 @@ def is_op_build_included(op_name): else: print("Warning: compatibility folder not found") -print ("-----py_modules--------", py_modules) +class BinaryDistribution(Distribution): + """Force wheel to be platform-specific even without ext_modules.""" + def has_ext_modules(self): + return True setup( name="apex", @@ -300,7 +303,8 @@ def is_op_build_included(op_name): extras_require=extras, install_requires=required, include_package_data=True, - py_modules=py_modules + py_modules=py_modules, + distclass=BinaryDistribution ) #delete the temporarily copied compatibility files From 484358cc57e21fc9146605ac5a2dfb19ce1acc69 Mon Sep 17 00:00:00 2001 From: skishore Date: Tue, 25 Nov 2025 19:13:42 +0000 Subject: [PATCH 73/79] add ninja and wheel to requirements to be installed --- requirements.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 478362844..d527b4498 100644 --- a/requirements.txt 
+++ b/requirements.txt @@ -7,4 +7,6 @@ packaging>=14.0 matplotlib>=3.8 pandas>=2.2.2 py-cpuinfo -build \ No newline at end of file +build +ninja +wheel \ No newline at end of file From 6388f5ad12950abba005697da32fc22d0325efd6 Mon Sep 17 00:00:00 2001 From: skishore Date: Tue, 25 Nov 2025 20:46:46 +0000 Subject: [PATCH 74/79] Update Release notes in Readme --- README.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/README.md b/README.md index e86209a80..33d2213e1 100644 --- a/README.md +++ b/README.md @@ -227,11 +227,32 @@ If you installed Pytorch in a Conda environment, make sure to install Apex in th # Release notes +## release/1.10.0 + +Build and installation related +- Support JIT (just-in-time) load cpp and CUDA extensions + +## release/1.9.0 + +- No new features were added in this release cycle. + +## release/1.8.0 + +Unit test related +- Fix transformer unit tests +- Fix fused dense gelu dense unit tests + ## release/1.7.0 +Build and installation related +- Support use of BUILD_VERSION environment to override version.txt when creating apex wheels +- Disable aiter installation by default. make aiter command is used to build apex + Unit test related - Include running transformer tests in L0/run_test.py - Fix transformer unit tests +- Fix batch norm unit tests +- Fix fused dense gelu dense unit tests ## release/1.6.0 From 929f4adb1644248d58a8749bc052ae6057cbb5d6 Mon Sep 17 00:00:00 2001 From: skishore Date: Tue, 25 Nov 2025 21:04:37 +0000 Subject: [PATCH 75/79] Exclude compatibility folder while installing apex --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 84e2d6294..816849e7c 100644 --- a/setup.py +++ b/setup.py @@ -295,7 +295,7 @@ def has_ext_modules(self): name="apex", version=get_apex_version(), packages=find_packages( - exclude=("build", "include", "tests", "dist", "docs", "tests", "examples", "apex.egg-info", "op_builder") + exclude=("build", "include", "tests", "dist", "docs", "tests", "examples", "apex.egg-info", "op_builder", "compatibility") ), description="PyTorch Extensions written by NVIDIA", ext_modules=ext_modules, From e16c45b13a2fde2dc6bad59db1f242684c63063c Mon Sep 17 00:00:00 2001 From: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com> Date: Wed, 26 Nov 2025 15:40:43 -0600 Subject: [PATCH 76/79] Update README.md --- README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 33d2213e1..b332bcc25 100644 --- a/README.md +++ b/README.md @@ -109,7 +109,8 @@ Torch must be installed before installing apex. We recommend using the nightly P Apex on ROCm supports both python only build and extension build. Note: Pytorch version recommended is >=1.5 for extension build. -### To install and jit load the extensions use the following command in apex folder: +### The following command will install all the extensions, which will be built and linked at runtime using [PyTorch's JIT (just-in-time) loader](https://pytorch.org/docs/stable/cpp_extension.html): +This requires ninja to be installed ``` pip install . 
--no-build-isolation ``` @@ -136,12 +137,12 @@ ubuntu|pytorch|apex|release/1.0.0|06c33eee43f7a22f3ed7d9c3e5be0ddd757dc345|https centos|pytorch|apex|release/1.0.0|06c33eee43f7a22f3ed7d9c3e5be0ddd757dc345|https://github.com/ROCmSoftwarePlatform/apex ``` -### To install the extensions while installing use the following command in apex folder: +### To build and install all the supported extensions while installing apex, use the following command in apex folder: ``` APEX_BUILD_CPP_OPS=1 APEX_BUILD_CUDA_OPS=1 pip install . --no-build-isolation ``` -It is possible to build specific extensions by using the following command in apex folder: +It is possible to build and install specific extensions by using the following command in apex folder: ``` APEX_BUILD_=1 pip install . --no-build-isolation ``` From b52cb46400584e9dd26d85bf51de938111913671 Mon Sep 17 00:00:00 2001 From: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com> Date: Wed, 26 Nov 2025 15:43:00 -0600 Subject: [PATCH 77/79] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index b332bcc25..607efefab 100644 --- a/README.md +++ b/README.md @@ -137,12 +137,12 @@ ubuntu|pytorch|apex|release/1.0.0|06c33eee43f7a22f3ed7d9c3e5be0ddd757dc345|https centos|pytorch|apex|release/1.0.0|06c33eee43f7a22f3ed7d9c3e5be0ddd757dc345|https://github.com/ROCmSoftwarePlatform/apex ``` -### To build and install all the supported extensions while installing apex, use the following command in apex folder: +### To pre-build and install all the supported extensions while installing apex, use the following command in apex folder: ``` APEX_BUILD_CPP_OPS=1 APEX_BUILD_CUDA_OPS=1 pip install . --no-build-isolation ``` -It is possible to build and install specific extensions by using the following command in apex folder: +It is also possible to pre-build and install specific extensions by using the following command in apex folder: ``` APEX_BUILD_=1 pip install . --no-build-isolation ``` From 3f8f4fd9194722308077854d8be27adb521bc4df Mon Sep 17 00:00:00 2001 From: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com> Date: Wed, 26 Nov 2025 15:47:39 -0600 Subject: [PATCH 78/79] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 607efefab..81b647993 100644 --- a/README.md +++ b/README.md @@ -181,7 +181,7 @@ For example, to build FUSED_DENSE​ you can use the following command: ``` APEX_BUILD_FUSED_DENSE​=1 pip install . --no-build-isolation ``` -This will install FUSED_DENSE​ module and rest of the modules are JIT loaded. +This will pre-build and install FUSED_DENSE​ module and rest of the modules are installed to be JIT built and loaded at runtime. @@ -192,7 +192,7 @@ make aiter To use aiter in fused rope, you can use the flag ```USE_ROCM_AITER_ROPE_BACKEND=1```. 
-### To create a wheel and then install using the wheel, use the following command in apex folder:
+### To create a wheel and then install apex using the wheel, use the following command in apex folder:
 ```
 python -m build --wheel --no-isolation (can use the same environment variables to build specific extensions, cpp extensions and cuda extensions)
 pip install dist/apex-*.whl

From 5920a1bfabf427e57e6d377da538422ba2f40375 Mon Sep 17 00:00:00 2001
From: sriram
Date: Mon, 1 Dec 2025 13:19:50 +0200
Subject: [PATCH 79/79] Adding modification note to the original copyright

---
 apex/git_version_info.py | 4 ++++
 op_builder/__init__.py   | 4 ++++
 op_builder/all_ops.py    | 4 ++++
 op_builder/builder.py    | 4 ++++
 4 files changed, 16 insertions(+)

diff --git a/apex/git_version_info.py b/apex/git_version_info.py
index 3b20d4d39..ee9e7c6c7 100644
--- a/apex/git_version_info.py
+++ b/apex/git_version_info.py
@@ -3,6 +3,10 @@

 # DeepSpeed Team

+# Portions of this code were adapted from DeepSpeed:
+# https://github.com/microsoft/DeepSpeed
+# Modified for ROCm Apex
+
 try:
     # This is populated by setup.py
     from .git_version_info_installed import * # noqa: F401 # type: ignore
diff --git a/op_builder/__init__.py b/op_builder/__init__.py
index 34dbd96c3..726ec6f4d 100644
--- a/op_builder/__init__.py
+++ b/op_builder/__init__.py
@@ -3,6 +3,10 @@

 # DeepSpeed Team

+# Portions of this code were adapted from DeepSpeed:
+# https://github.com/microsoft/DeepSpeed
+# Modified for ROCm Apex
+
 import sys
 import os
 import pkgutil
diff --git a/op_builder/all_ops.py b/op_builder/all_ops.py
index 2c0f9166d..e18dbdd71 100644
--- a/op_builder/all_ops.py
+++ b/op_builder/all_ops.py
@@ -3,6 +3,10 @@

 # DeepSpeed Team

+# Portions of this code were adapted from DeepSpeed:
+# https://github.com/microsoft/DeepSpeed
+# Modified for ROCm Apex
+
 import os
 import pkgutil
 import importlib
diff --git a/op_builder/builder.py b/op_builder/builder.py
index 9b8e78502..60e490b2b 100644
--- a/op_builder/builder.py
+++ b/op_builder/builder.py
@@ -3,6 +3,10 @@

 # DeepSpeed Team

+# Portions of this code were adapted from DeepSpeed:
+# https://github.com/microsoft/DeepSpeed
+# Modified for ROCm Apex
+
 import os
 import re
 import sys
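For reference, a minimal sketch of the JIT-load workflow this series introduces, assuming the builder API already exercised in `tests/jit_build/load_extra_extensions.py` (`FusedLambBuilder().load()`) and the `apex.op_builder` package layout added above; it is an illustration only, and whether a given extension is pre-built or JIT-built depends on which `APEX_BUILD_*` variables were set at install time, as described in the README changes in the patches above. JIT builds also require `ninja` (added to requirements.txt in this series).

```
# Minimal sketch: build and load one extension on demand instead of pre-building it.
# Assumes the apex.op_builder package layout introduced by this patch series.
from apex.op_builder.fused_lamb import FusedLambBuilder

builder = FusedLambBuilder()
# load() imports the module if it was pre-built at install time; otherwise it
# JIT-compiles the extension and reuses the cached build on later calls.
fused_lamb_cuda = builder.load()
print(type(fused_lamb_cuda))
```

The same call pattern applies to the other builders listed in `load_extra_extensions.py`, for example `NCCLP2PBuilder` and `GenericScaledMaskedSoftmaxCudaBuilder`.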