diff --git a/.gitignore b/.gitignore
index 5fe868b36..da67982aa 100644
--- a/.gitignore
+++ b/.gitignore
@@ -148,3 +148,7 @@ cython_debug/
 *.hip
 *_hip.*
 *hip*
+
+
+# file temporarily created for the build process
+apex/git_version_info_installed.py
\ No newline at end of file
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 000000000..a5dc0456c
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,2 @@
+recursive-include apex/contrib/csrc *
+recursive-include apex/csrc *
\ No newline at end of file
diff --git a/Makefile b/Makefile
new file mode 100644
index 000000000..99e44805f
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,17 @@
+PYTHON = python3
+PIP = $(PYTHON) -m pip
+
+clean: # This will remove ALL build folders.
+	@test -d build/ && echo "Deleting build folder" || true
+	@test -d build/ && rm -r build/ || true
+	@test -d dist/ && echo "Deleting dist folder" || true
+	@test -d dist/ && rm -r dist/ || true
+	@test -d apex.egg-info/ && echo "Deleting apex.egg-info folder" || true
+	@test -d apex.egg-info/ && rm -r apex.egg-info/ || true
+
+	$(PYTHON) scripts/clean.py # remove the apex extensions installed in the torch extensions folder
+
+aiter:
+	$(PIP) uninstall -y aiter
+	cd third_party/aiter && $(PIP) install . --no-build-isolation --no-deps
+
diff --git a/README.md b/README.md
index 66bead147..81b647993 100644
--- a/README.md
+++ b/README.md
@@ -100,24 +100,21 @@ Note that we recommend restoring the model using the same `opt_level`. Also note
 # Installation
 
 ## Containers
-ROCm pytorch containers are available from https://hub.docker.com/r/rocm/pytorch.
+ROCm PyTorch containers ship with the apex package preinstalled and are available from https://hub.docker.com/r/rocm/pytorch.
 
 ## From Source
 
-To install Apex from source, we recommend using the nightly Pytorch obtainable from https://github.com/rocm/pytorch.
+PyTorch must be installed before installing apex. We recommend the nightly PyTorch obtainable from https://github.com/rocm/pytorch. The latest stable release obtainable from https://pytorch.org should also work.
 
-The latest stable release obtainable from https://pytorch.org should also work.
-
-## ROCm
 Apex on ROCm supports both python only build and extension build.
 Note: Pytorch version recommended is >=1.5 for extension build.
 
-### To install using python only build use the following command in apex folder:
+### To install all the extensions so that they are built and linked at runtime using [PyTorch's JIT (just-in-time) loader](https://pytorch.org/docs/stable/cpp_extension.html), use the following command in the apex folder:
+This requires ninja to be installed.
 ```
-python setup.py install
+pip install . --no-build-isolation
 ```
-=======
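As a quick sanity check of the JIT-load path described above (an illustrative sketch, not part of this patch): after a default `pip install . --no-build-isolation`, the first use of a fused op should trigger a ninja build through PyTorch's cpp_extension loader. The apex API used below is the standard `FusedLayerNorm`; the tensor shapes are arbitrary.

```python
# Hypothetical smoke test: the first forward pass should JIT-build fused_layer_norm_cuda.
import torch
from apex.normalization import FusedLayerNorm

layer = FusedLayerNorm(normalized_shape=64).cuda()
out = layer(torch.randn(8, 64, device="cuda"))  # triggers the runtime build on first use
print(out.shape)
```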
 ### Supported Versions
 | ``APEX Version`` | ``APEX branch`` | ``Torch Version`` |
 |------------------|-----------------|-------------------|
@@ -140,26 +137,73 @@ ubuntu|pytorch|apex|release/1.0.0|06c33eee43f7a22f3ed7d9c3e5be0ddd757dc345|https
 centos|pytorch|apex|release/1.0.0|06c33eee43f7a22f3ed7d9c3e5be0ddd757dc345|https://github.com/ROCmSoftwarePlatform/apex
 ```
 
-### To install using extensions enabled use the following command in apex folder:
+### To pre-build and install all the supported extensions while installing apex, use the following command in the apex folder:
+```
+APEX_BUILD_CPP_OPS=1 APEX_BUILD_CUDA_OPS=1 pip install . --no-build-isolation
 ```
-# if pip >= 23.1 (ref: https://pip.pypa.io/en/stable/news/#v23-1) which supports multiple `--config-settings` with the same key...
-pip install -v --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
-# otherwise
-python setup.py install --cpp_ext --cuda_ext
+It is also possible to pre-build and install specific extensions by using the following command in the apex folder:
+```
+APEX_BUILD_<EXTENSION>=1 pip install . --no-build-isolation
 ```
-Note that using --cuda_ext flag to install Apex will also enable all the extensions supported on ROCm including "--distributed_adam", "--distributed_lamb", "--bnp", "--xentropy", "--deprecated_fused_adam", "--deprecated_fused_lamb", and "--fast_multihead_attn".
+The following extensions are supported:
+| extension | environment variable to build this extension only | group environment variable that includes it |
+|-----------|-----------|-----------|
+| amp_C | APEX_BUILD_AMP_C=1 | APEX_BUILD_CUDA_OPS=1 |
+| apex_C | APEX_BUILD_APEX_C=1 | APEX_BUILD_CPP_OPS=1 |
+| bnp | APEX_BUILD_BNP=1 | APEX_BUILD_CUDA_OPS=1 |
+| distributed_adam_cuda | APEX_BUILD_DISTRIBUTED_ADAM=1 | APEX_BUILD_CUDA_OPS=1 |
+| distributed_lamb_cuda | APEX_BUILD_DISTRIBUTED_LAMB=1 | APEX_BUILD_CUDA_OPS=1 |
+| fast_multihead_attn | APEX_BUILD_FAST_MULTIHEAD_ATTN=1 | APEX_BUILD_CUDA_OPS=1 |
+| focal_loss_cuda | APEX_BUILD_FOCAL_LOSS=1 | APEX_BUILD_CUDA_OPS=1 |
+| fused_adam_cuda | APEX_BUILD_FUSED_ADAM=1 | APEX_BUILD_CUDA_OPS=1 |
+| fused_bias_swiglu | APEX_BUILD_FUSED_BIAS_SWIGLU=1 | APEX_BUILD_CUDA_OPS=1 |
+| fused_dense_cuda | APEX_BUILD_FUSED_DENSE=1 | APEX_BUILD_CUDA_OPS=1 |
+| fused_index_mul_2d | APEX_BUILD_FUSED_INDEX_MUL_2D=1 | APEX_BUILD_CUDA_OPS=1 |
+| fused_lamb_cuda | APEX_BUILD_FUSED_LAMB=1 | APEX_BUILD_CUDA_OPS=1 |
+| fused_layer_norm_cuda | APEX_BUILD_FUSED_LAYER_NORM=1 | APEX_BUILD_CUDA_OPS=1 |
+| fused_rotary_positional_embedding | APEX_BUILD_FUSED_ROPE=1 | APEX_BUILD_CUDA_OPS=1 |
+| fused_weight_gradient_mlp_cuda | APEX_BUILD_FUSED_WEIGHT_GRADIENT_MLP=1 | APEX_BUILD_CUDA_OPS=1 |
+| generic_scaled_masked_softmax_cuda | APEX_BUILD_GENERIC_SCALED_MASKED_SOFTMAX_CUDA=1 | APEX_BUILD_CUDA_OPS=1 |
+| mlp_cuda | APEX_BUILD_MLP=1 | APEX_BUILD_CUDA_OPS=1 |
+| _apex_nccl_allocator | APEX_BUILD_NCCL_ALLOCATOR=1 | APEX_BUILD_CUDA_OPS=1 |
+| nccl_p2p_cuda | APEX_BUILD_NCCL_P2P=1 | APEX_BUILD_CUDA_OPS=1 |
+| peer_memory_cuda | APEX_BUILD_PEER_MEMORY=1 | APEX_BUILD_CUDA_OPS=1 |
+| scaled_masked_softmax_cuda | APEX_BUILD_SCALED_MASKED_SOFTMAX_CUDA=1 | APEX_BUILD_CUDA_OPS=1 |
+| scaled_softmax_cuda | APEX_BUILD_SCALED_SOFTMAX_CUDA=1 | APEX_BUILD_CUDA_OPS=1 |
+| scaled_upper_triang_masked_softmax_cuda | APEX_BUILD_SCALED_UPPER_TRIANG_MASKED_SOFTMAX_CUDA=1 | APEX_BUILD_CUDA_OPS=1 |
+| syncbn | APEX_BUILD_SYNCBN=1 | APEX_BUILD_CUDA_OPS=1 |
+| transducer_joint_cuda | APEX_BUILD_TRANSDUCER_JOINT=1 | APEX_BUILD_CUDA_OPS=1 |
+| transducer_loss_cuda | APEX_BUILD_TRANSDUCER_LOSS=1 | APEX_BUILD_CUDA_OPS=1 |
+| xentropy_cuda | APEX_BUILD_XENTROPY=1 | APEX_BUILD_CUDA_OPS=1 |
+
+For example, to build FUSED_DENSE you can use the following command:
+```
+APEX_BUILD_FUSED_DENSE=1 pip install . --no-build-isolation
+```
+This will pre-build and install the FUSED_DENSE module, while the rest of the modules are installed to be JIT built and loaded at runtime.
+
 
-In addition, aiter backend can be built during apex installation by providing --aiter flag
+
+The aiter backend can be built and used for fused RoPE. To install aiter:
 ```
-# if pip >= 23.1 (ref: https://pip.pypa.io/en/stable/news/#v23-1) which supports multiple `--config-settings` with the same key...
-pip install -v --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" --config-settings "--build-option=--aiter" ./
-# otherwise
-python setup.py install --cpp_ext --cuda_ext --aiter
+make aiter
 ```
 
 To use aiter in fused rope, you can use the flag ```USE_ROCM_AITER_ROPE_BACKEND=1```.
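The extension names and environment variables in the table above mirror what the new `op_builder` registry exposes at runtime. As an illustrative sketch (not part of this patch), the registry can be queried to see which extensions are buildable on the current machine, assuming this apex branch and torch are installed:

```python
# Sketch: enumerate the extension builders registered in op_builder/all_ops.py
# and report whether each one can be built here.
# ALL_OPS maps extension name -> builder instance (see op_builder/all_ops.py).
from apex.op_builder.all_ops import ALL_OPS

for name, builder in ALL_OPS.items():
    print(f"{name}: compatible={builder.is_compatible()}")
```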
+
+### To create a wheel and then install apex from the wheel, use the following commands in the apex folder:
+```
+python -m build --wheel --no-isolation   # the same APEX_BUILD_* environment variables can be used to pre-build specific, cpp, or cuda extensions
+pip install dist/apex-*.whl
+```
+
+### To uninstall apex and its extensions, use the following commands in the apex folder:
+```
+pip uninstall apex
+make clean
+```
+
 ### Enable hipblasLT on ROCm
 hipblasLT is supported only on mi300 (gfx942) only. python setup.py automatically builds apex with hipblasLT support only if GPU device id is gfx942
@@ -173,33 +217,22 @@ CUDA and C++ extensions via
 ```bash
 git clone https://github.com/rocm/apex
 cd apex
-# if pip >= 23.1 (ref: https://pip.pypa.io/en/stable/news/#v23-1) which supports multiple `--config-settings` with the same key...
-pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
-# otherwise
-pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --global-option="--cpp_ext" --global-option="--cuda_ext" ./
-```
-
-Apex also supports a Python-only build via
-```bash
-pip install -v --disable-pip-version-check --no-build-isolation --no-cache-dir ./
+pip install . --no-build-isolation
 ```
-
-A Python-only build omits:
-- Fused kernels required to use `apex.optimizers.FusedAdam`.
-- Fused kernels required to use `apex.normalization.FusedLayerNorm` and `apex.normalization.FusedRMSNorm`.
-- Fused kernels that improve the performance and numerical stability of `apex.parallel.SyncBatchNorm`.
-- Fused kernels that improve the performance of `apex.parallel.DistributedDataParallel` and `apex.amp`.
-`DistributedDataParallel`, `amp`, and `SyncBatchNorm` will still be usable, but they may be slower.
 
 
 ### [Experimental] Windows
-`pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .` may work if you were able to build Pytorch from source
-on your system. A Python-only build via `pip install -v --no-cache-dir .` is more likely to work.
+`pip install . --no-build-isolation` may work if you were able to build PyTorch from source
+on your system. A Python-only build via `pip install --no-build-isolation -v --no-cache-dir .` is more likely to work.
 If you installed Pytorch in a Conda environment, make sure to install Apex in that same environment.
 
-
 # Release notes
-# Release notes
+## release/1.10.0
+
+Build and installation related changes:
+- Support JIT (just-in-time) loading of C++ and CUDA extensions
+
 ## release/1.9.0
 
 - No new features were added in this release cycle.
diff --git a/apex/contrib/test/run_rocm_extensions.py b/apex/contrib/test/run_rocm_extensions.py
index c7801988b..1c9add5d8 100644
--- a/apex/contrib/test/run_rocm_extensions.py
+++ b/apex/contrib/test/run_rocm_extensions.py
@@ -2,25 +2,27 @@
 import sys
 
-test_dirs = ["groupbn", "fused_dense", "layer_norm", "multihead_attn", "transducer", "focal_loss", "index_mul_2d", "."] # "." for test_label_smoothing.py
+test_dirs = ["groupbn", "layer_norm", "multihead_attn", "transducer", "focal_loss", "index_mul_2d", ".", \
+    "optimizers", "clip_grad"] # "."
for test_label_smoothing.py ROCM_BLACKLIST = [ "layer_norm" ] -runner = unittest.TextTestRunner(verbosity=2) +if __name__ == '__main__': + runner = unittest.TextTestRunner(verbosity=2) -errcode = 0 + errcode = 0 -for test_dir in test_dirs: - if test_dir in ROCM_BLACKLIST: - continue - suite = unittest.TestLoader().discover(test_dir) + for test_dir in test_dirs: + if test_dir in ROCM_BLACKLIST: + continue + suite = unittest.TestLoader().discover(test_dir) - print("\nExecuting tests from " + test_dir) + print("\nExecuting tests from " + test_dir) - result = runner.run(suite) + result = runner.run(suite) - if not result.wasSuccessful(): - errcode = 1 + if not result.wasSuccessful(): + errcode = 1 -sys.exit(errcode) + sys.exit(errcode) diff --git a/apex/csrc b/apex/csrc new file mode 120000 index 000000000..e96d28eb5 --- /dev/null +++ b/apex/csrc @@ -0,0 +1 @@ +../csrc \ No newline at end of file diff --git a/apex/git_version_info.py b/apex/git_version_info.py new file mode 100644 index 000000000..ee9e7c6c7 --- /dev/null +++ b/apex/git_version_info.py @@ -0,0 +1,34 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# Portions of this code were adapted from DeepSpeed: +# https://github.com/microsoft/DeepSpeed +# Modified for ROCm Apex + +try: + # This is populated by setup.py + from .git_version_info_installed import * # noqa: F401 # type: ignore +except ModuleNotFoundError: + import os + if os.path.isfile('version.txt'): + # Will be missing from checkouts that haven't been installed (e.g., readthedocs) + version = open('version.txt', 'r').read().strip() + else: + version = "0.0.0" + git_hash = '[none]' + git_branch = '[none]' + + from .op_builder.all_ops import ALL_OPS + installed_ops = dict.fromkeys(ALL_OPS.keys(), False) + torch_info = {'version': "0.0", "cuda_version": "0.0", "hip_version": "0.0"} + +# compatible_ops list is recreated for each launch +from .op_builder.all_ops import ALL_OPS + +compatible_ops = dict.fromkeys(ALL_OPS.keys(), False) +for op_name, builder in ALL_OPS.items(): + op_compatible = builder.is_compatible() + compatible_ops[op_name] = op_compatible + compatible_ops["apex_not_implemented"] = False \ No newline at end of file diff --git a/apex/op_builder b/apex/op_builder new file mode 120000 index 000000000..1e19f3e8d --- /dev/null +++ b/apex/op_builder @@ -0,0 +1 @@ +../op_builder \ No newline at end of file diff --git a/compatibility/__init__.py b/compatibility/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/compatibility/_apex_nccl_allocator.py b/compatibility/_apex_nccl_allocator.py new file mode 100644 index 000000000..6a029d1ee --- /dev/null +++ b/compatibility/_apex_nccl_allocator.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _ApexNcclAllocatorModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'NCCLAllocatorBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load _apex_nccl_allocator : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_") and name != "__class__": + raise AttributeError(f"module _apex_nccl_allocator has no attribute '{name}'") + return getattr(self._load_module(), 
name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _ApexNcclAllocatorModule() \ No newline at end of file diff --git a/compatibility/amp_C.py b/compatibility/amp_C.py new file mode 100644 index 000000000..f9257c596 --- /dev/null +++ b/compatibility/amp_C.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _AmpCModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'AmpCBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load amp_C : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module amp_C has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _AmpCModule() \ No newline at end of file diff --git a/compatibility/apex_C.py b/compatibility/apex_C.py new file mode 100644 index 000000000..39bac5264 --- /dev/null +++ b/compatibility/apex_C.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _ApexCModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'ApexCBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load apex_C : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module apex_C has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _ApexCModule() \ No newline at end of file diff --git a/compatibility/bnp.py b/compatibility/bnp.py new file mode 100644 index 000000000..b03ba798c --- /dev/null +++ b/compatibility/bnp.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _BnpModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'BnpBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load bnp : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module bnp has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _BnpModule() \ No newline at end of file diff --git a/compatibility/distributed_adam_cuda.py 
b/compatibility/distributed_adam_cuda.py new file mode 100644 index 000000000..2566dce11 --- /dev/null +++ b/compatibility/distributed_adam_cuda.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _DistributedAdamCudaModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'DistributedAdamBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load distributed_adam_cuda : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module distributed_adam_cuda has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _DistributedAdamCudaModule() \ No newline at end of file diff --git a/compatibility/distributed_lamb_cuda.py b/compatibility/distributed_lamb_cuda.py new file mode 100644 index 000000000..7f0b64f3e --- /dev/null +++ b/compatibility/distributed_lamb_cuda.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _DistributedLambCudaModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'DistributedLambBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load distributed_lamb_cuda : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module distributed_lamb_cuda has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _DistributedLambCudaModule() \ No newline at end of file diff --git a/compatibility/fast_multihead_attn.py b/compatibility/fast_multihead_attn.py new file mode 100644 index 000000000..a9e060b87 --- /dev/null +++ b/compatibility/fast_multihead_attn.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _FastMultiheadAttnModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'FastMultiheadAttnBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load fast_multihead_attn : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module fast_multihead_attn has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _FastMultiheadAttnModule() \ No 
newline at end of file diff --git a/compatibility/focal_loss_cuda.py b/compatibility/focal_loss_cuda.py new file mode 100644 index 000000000..c7b364faf --- /dev/null +++ b/compatibility/focal_loss_cuda.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _FocalLossCudaModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'FocalLossBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load focal_loss_cuda : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module focal_loss_cuda has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _FocalLossCudaModule() \ No newline at end of file diff --git a/compatibility/fused_adam_cuda.py b/compatibility/fused_adam_cuda.py new file mode 100644 index 000000000..bf31ca739 --- /dev/null +++ b/compatibility/fused_adam_cuda.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _FusedAdamCudaModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'FusedAdamBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load fused_adam_cuda : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module fused_adam_cuda has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _FusedAdamCudaModule() \ No newline at end of file diff --git a/compatibility/fused_bias_swiglu.py b/compatibility/fused_bias_swiglu.py new file mode 100644 index 000000000..e9f066f4a --- /dev/null +++ b/compatibility/fused_bias_swiglu.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _FusedBiasSwiGLUModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'FusedBiasSwiGLUBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load fused_bias_swiglu : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module fused_bias_swiglu has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _FusedBiasSwiGLUModule() \ No newline at end of file diff --git 
a/compatibility/fused_dense_cuda.py b/compatibility/fused_dense_cuda.py new file mode 100644 index 000000000..0d28badb2 --- /dev/null +++ b/compatibility/fused_dense_cuda.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _FusedDenseCudaModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'FusedDenseBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load fused_dense_cuda : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module fused_dense_cuda has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _FusedDenseCudaModule() \ No newline at end of file diff --git a/compatibility/fused_index_mul_2d.py b/compatibility/fused_index_mul_2d.py new file mode 100644 index 000000000..c036877df --- /dev/null +++ b/compatibility/fused_index_mul_2d.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _FusedIndexMul2dModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'FusedIndexMul2dBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load fused_index_mul_2d : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module fused_index_mul_2d has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _FusedIndexMul2dModule() \ No newline at end of file diff --git a/compatibility/fused_lamb_cuda.py b/compatibility/fused_lamb_cuda.py new file mode 100644 index 000000000..3ab88d443 --- /dev/null +++ b/compatibility/fused_lamb_cuda.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _FusedLambCudaModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'FusedLambBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load fused_lamb_cuda : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module fused_lamb_cuda has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _FusedLambCudaModule() \ No newline at end of file diff --git 
a/compatibility/fused_layer_norm_cuda.py b/compatibility/fused_layer_norm_cuda.py new file mode 100644 index 000000000..2722e0252 --- /dev/null +++ b/compatibility/fused_layer_norm_cuda.py @@ -0,0 +1,44 @@ +import sys +import importlib + +class _FusedLayerCudaModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + #import the builder + apex_op_builder = importlib.import_module('apex.op_builder') + mlp_builder = getattr(apex_op_builder, 'FusedLayerNormBuilder') + + #load the module + self._loaded_module = mlp_builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load fused_layer_norm_cuda : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module fused_layer_norm_cuda has no attribute '{name}'") + + module = self._load_module() + return getattr(module, name) + + def __dir__(self): + try: + module = self._load_module() + return dir(module) + except: + return [] + + def __repr__(self): + return "" + +#replace module with lazy loader +sys.modules[__name__] = _FusedLayerCudaModule() \ No newline at end of file diff --git a/compatibility/fused_rotary_positional_embedding.py b/compatibility/fused_rotary_positional_embedding.py new file mode 100644 index 000000000..d4f87bd33 --- /dev/null +++ b/compatibility/fused_rotary_positional_embedding.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _FusedRotaryPositionalEmbeddingModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'FusedRopeBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load fused_rotary_positional_embedding : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module fused_rotary_positional_embedding has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _FusedRotaryPositionalEmbeddingModule() \ No newline at end of file diff --git a/compatibility/fused_weight_gradient_mlp_cuda.py b/compatibility/fused_weight_gradient_mlp_cuda.py new file mode 100644 index 000000000..219d9355b --- /dev/null +++ b/compatibility/fused_weight_gradient_mlp_cuda.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _FusedWeightGradientMlpCudaModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'FusedWeightGradientMlpCudaBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load fused_weight_gradient_mlp_cuda : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise 
AttributeError(f"module fused_weight_gradient_mlp_cuda has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _FusedWeightGradientMlpCudaModule() \ No newline at end of file diff --git a/compatibility/generic_scaled_masked_softmax_cuda.py b/compatibility/generic_scaled_masked_softmax_cuda.py new file mode 100644 index 000000000..fa50ca52c --- /dev/null +++ b/compatibility/generic_scaled_masked_softmax_cuda.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _GenericScaledMaskedSoftmaxCudaModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'GenericScaledMaskedSoftmaxCudaBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load generic_scaled_masked_softmax_cuda : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module generic_scaled_masked_softmax_cuda has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _GenericScaledMaskedSoftmaxCudaModule() \ No newline at end of file diff --git a/compatibility/mlp_cuda.py b/compatibility/mlp_cuda.py new file mode 100644 index 000000000..4c873d560 --- /dev/null +++ b/compatibility/mlp_cuda.py @@ -0,0 +1,44 @@ +import sys +import importlib + +class _MLPCudaModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + #import the builder + apex_op_builder = importlib.import_module('apex.op_builder') + mlp_builder = getattr(apex_op_builder, 'MlpBuilder') + + #load the module + self._loaded_module = mlp_builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load mlp_cuda : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module mlp_cuda has no attribute '{name}'") + + module = self._load_module() + return getattr(module, name) + + def __dir__(self): + try: + module = self._load_module() + return dir(module) + except: + return [] + + def __repr__(self): + return "" + +#replace module with lazy loader +sys.modules[__name__] = _MLPCudaModule() \ No newline at end of file diff --git a/compatibility/nccl_p2p_cuda.py b/compatibility/nccl_p2p_cuda.py new file mode 100644 index 000000000..d937cb95e --- /dev/null +++ b/compatibility/nccl_p2p_cuda.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _NcclP2pCudaModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'NCCLP2PBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise 
ImportError(f"Failed to load nccl_p2p_cuda : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module nccl_p2p_cuda has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _NcclP2pCudaModule() \ No newline at end of file diff --git a/compatibility/peer_memory_cuda.py b/compatibility/peer_memory_cuda.py new file mode 100644 index 000000000..d909ec1b9 --- /dev/null +++ b/compatibility/peer_memory_cuda.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _PeerMemoryCudaModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'PeerMemoryBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load peer_memory_cuda : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module peer_memory_cuda has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _PeerMemoryCudaModule() \ No newline at end of file diff --git a/compatibility/scaled_masked_softmax_cuda.py b/compatibility/scaled_masked_softmax_cuda.py new file mode 100644 index 000000000..77ed74e47 --- /dev/null +++ b/compatibility/scaled_masked_softmax_cuda.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _ScaledMaskedSoftmaxCudaModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'ScaledMaskedSoftmaxCudaBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load scaled_masked_softmax_cuda : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module scaled_masked_softmax_cuda has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _ScaledMaskedSoftmaxCudaModule() \ No newline at end of file diff --git a/compatibility/scaled_softmax_cuda.py b/compatibility/scaled_softmax_cuda.py new file mode 100644 index 000000000..d7a4427e3 --- /dev/null +++ b/compatibility/scaled_softmax_cuda.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _ScaledSoftmaxCudaModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'ScaledSoftmaxCudaBuilder') + self._loaded_module = builder().load() + 
except Exception as e: + self._loading = False + raise ImportError(f"Failed to load scaled_softmax_cuda : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module scaled_softmax_cuda has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _ScaledSoftmaxCudaModule() \ No newline at end of file diff --git a/compatibility/scaled_upper_triang_masked_softmax_cuda.py b/compatibility/scaled_upper_triang_masked_softmax_cuda.py new file mode 100644 index 000000000..8da9b5c67 --- /dev/null +++ b/compatibility/scaled_upper_triang_masked_softmax_cuda.py @@ -0,0 +1,38 @@ +import sys +import importlib + +class _ScaledUpperTriangMaskedSoftmaxCudaModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + name = 'ScaledUpperTriangMaskedSoftmaxCudaBuilder' + builder = getattr(apex_op_builder, name) + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load scaled_upper_triang_masked_softmax_cuda : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name_attr): + if name_attr.startswith("_"): + raise AttributeError(f"module scaled_upper_triang_masked_softmax_cuda has no attribute '{name_attr}'") + return getattr(self._load_module(), name_attr) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _ScaledUpperTriangMaskedSoftmaxCudaModule() \ No newline at end of file diff --git a/compatibility/syncbn.py b/compatibility/syncbn.py new file mode 100644 index 000000000..b619575dc --- /dev/null +++ b/compatibility/syncbn.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _SyncbnModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'SyncBnBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load syncbn : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module syncbn has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _SyncbnModule() \ No newline at end of file diff --git a/compatibility/transducer_joint_cuda.py b/compatibility/transducer_joint_cuda.py new file mode 100644 index 000000000..e06705fde --- /dev/null +++ b/compatibility/transducer_joint_cuda.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _TransducerJointCudaModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = 
importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'TransducerJointBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load transducer_joint_cuda : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module transducer_joint_cuda has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _TransducerJointCudaModule() \ No newline at end of file diff --git a/compatibility/transducer_loss_cuda.py b/compatibility/transducer_loss_cuda.py new file mode 100644 index 000000000..d5a2c0f36 --- /dev/null +++ b/compatibility/transducer_loss_cuda.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _TransducerLossCudaModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'TransducerLossBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load transducer_loss_cuda : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module transducer_loss_cuda has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _TransducerLossCudaModule() \ No newline at end of file diff --git a/compatibility/xentropy_cuda.py b/compatibility/xentropy_cuda.py new file mode 100644 index 000000000..ff4dc9733 --- /dev/null +++ b/compatibility/xentropy_cuda.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _XentropyCudaModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'XentropyBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load xentropy_cuda : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module xentropy_cuda has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _XentropyCudaModule() \ No newline at end of file diff --git a/contrib/csrc b/contrib/csrc new file mode 120000 index 000000000..4e941d8b2 --- /dev/null +++ b/contrib/csrc @@ -0,0 +1 @@ +../apex/contrib/csrc \ No newline at end of file diff --git a/op_builder/__init__.py b/op_builder/__init__.py new file mode 100644 index 000000000..726ec6f4d --- /dev/null +++ b/op_builder/__init__.py @@ -0,0 +1,56 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# Portions of this code were adapted from DeepSpeed: +# https://github.com/microsoft/DeepSpeed +# Modified for ROCm Apex + +import sys +import os +import pkgutil +import importlib + +from .builder import get_default_compute_capabilities, OpBuilder + +__apex__ = True + +# List of all available op builders from apex op_builder +try: + import apex.op_builder # noqa: F401 # type: ignore + op_builder_dir = "apex.op_builder" +except ImportError: + op_builder_dir = "op_builder" + +__op_builders__ = [] + +this_module = sys.modules[__name__] + + +def builder_closure(member_name): + if op_builder_dir == "op_builder": + # during installation time cannot get builder due to torch not installed, + # return closure instead + def _builder(): + from apex.op_builder.all_ops import BuilderUtils + builder = BuilderUtils().create_op_builder(member_name) + return builder + + return _builder + else: + # during runtime, return op builder class directly + from apex.op_builder.all_ops import BuilderUtils + builder = BuilderUtils().get_op_builder(member_name) + return builder + +# this is for the import statement such as 'from apex.op_builder import FusedLayerNormBuilder' to work +# reflect builder names and add builder closure, such as 'apex.op_builder.FusedLayerNormBuilder()' creates op builder +for _, module_name, _ in pkgutil.iter_modules([os.path.dirname(this_module.__file__)]): + if module_name != 'all_ops' and module_name != 'builder': + module = importlib.import_module(f".{module_name}", package=op_builder_dir) + for member_name in module.__dir__(): + if member_name.endswith('Builder') and member_name != "OpBuilder" and member_name != "CUDAOpBuilder" and member_name != "CPUOpBuilder": + # assign builder name to variable with same name + # the following is equivalent to i.e. TransformerBuilder = "TransformerBuilder" + this_module.__dict__[member_name] = builder_closure(member_name) \ No newline at end of file diff --git a/op_builder/all_ops.py b/op_builder/all_ops.py new file mode 100644 index 000000000..e18dbdd71 --- /dev/null +++ b/op_builder/all_ops.py @@ -0,0 +1,87 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# Portions of this code were adapted from DeepSpeed: +# https://github.com/microsoft/DeepSpeed +# Modified for ROCm Apex + +import os +import pkgutil +import importlib + +class BuilderUtils: + def op_builder_dir(self): + try: + # is op_builder from apex or a 3p version? this should only succeed if it's apex + # if successful this also means we're doing a local install and not JIT compile path + from op_builder import __apex__ + return "op_builder" + except ImportError: + return "apex.op_builder" + + # dict that holds class name <--> class type mapping i.e. + # 'AsyncIOBuilder': + # this dict will be filled at init stage + class_dict = None + + def _lazy_init_class_dict(self): + if self.class_dict is not None: + return + else: + self.class_dict = {} + # begin initialize for create_op_builder() + # put all valid class name <--> class type mapping into class_dict + op_builder_dir = self.op_builder_dir() + op_builder_module = importlib.import_module(op_builder_dir) + op_builder_absolute_path = os.path.dirname(op_builder_module.__file__) + for _, module_name, _ in pkgutil.iter_modules([op_builder_absolute_path]): + # avoid self references, + # skip sub_directories which contains ops for other backend(cpu, npu, etc.). 
+ if module_name != 'all_ops' and module_name != 'builder' and not os.path.isdir( + os.path.join(op_builder_absolute_path, module_name)): + module = importlib.import_module("{}.{}".format(op_builder_dir, module_name)) + for member_name in module.__dir__(): + if member_name.endswith( + 'Builder' + ) and member_name != "OpBuilder" and member_name != "CUDAOpBuilder" and member_name != "CPUOpBuilder": # avoid abstract classes + if not member_name in self.class_dict: + self.class_dict[member_name] = getattr(module, member_name) + # end initialize for create_op_builder() + + # create an instance of op builder and return, name specified by class_name + def create_op_builder(self, class_name): + self._lazy_init_class_dict() + if class_name in self.class_dict: + return self.class_dict[class_name]() + else: + return None + + # return an op builder class, name specified by class_name + def get_op_builder(self, class_name): + self._lazy_init_class_dict() + if class_name in self.class_dict: + return self.class_dict[class_name] + else: + return None + +# List of all available ops + +# append all builder names into __op_builders__ +builder_utils = BuilderUtils() +op_builder_dir = builder_utils.op_builder_dir() +op_builder_module = importlib.import_module(op_builder_dir) +__op_builders__ = [] + +for _, module_name, _ in pkgutil.iter_modules([os.path.dirname(op_builder_module.__file__)]): + # avoid self references + if module_name != 'all_ops' and module_name != 'builder': + module = importlib.import_module("{}.{}".format(op_builder_dir, module_name)) + for member_name in module.__dir__(): + if member_name.endswith('Builder'): + # append builder to __op_builders__ list + builder = builder_utils.create_op_builder(member_name) + __op_builders__.append(builder) + +ALL_OPS = {op.name: op for op in __op_builders__ if op is not None} \ No newline at end of file diff --git a/op_builder/amp_C.py b/op_builder/amp_C.py new file mode 100644 index 000000000..41f029fcb --- /dev/null +++ b/op_builder/amp_C.py @@ -0,0 +1,45 @@ +from .builder import CUDAOpBuilder + +import sys + + +class AmpCBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_AMP_C' + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" + NAME = "amp_C" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ['csrc/amp_C_frontend.cpp', + 'csrc/multi_tensor_sgd_kernel.cu', + 'csrc/multi_tensor_scale_kernel.cu', + 'csrc/multi_tensor_axpby_kernel.cu', + 'csrc/multi_tensor_l2norm_kernel.cu', + 'csrc/multi_tensor_l2norm_kernel_mp.cu', + 'csrc/multi_tensor_l2norm_scale_kernel.cu', + 'csrc/multi_tensor_lamb_stage_1.cu', + 'csrc/multi_tensor_lamb_stage_2.cu', + 'csrc/multi_tensor_adam.cu', + 'csrc/multi_tensor_adagrad.cu', + 'csrc/multi_tensor_novograd.cu', + 'csrc/multi_tensor_lars.cu', + 'csrc/multi_tensor_lamb.cu', + 'csrc/multi_tensor_lamb_mp.cu'] + + def include_paths(self): + return ['csrc/'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + nvcc_flags = ['-O3'] + self.version_dependent_macros() + if not self.is_rocm_pytorch(): + nvcc_flags += ['-lineinfo', '--use_fast_math'] + return nvcc_flags \ No newline at end of file diff --git a/op_builder/apex_C.py b/op_builder/apex_C.py new file mode 100644 index 000000000..b02526e77 --- /dev/null +++ b/op_builder/apex_C.py @@ -0,0 +1,25 @@ +from .builder import CPUOpBuilder + +import sys + + +class ApexCBuilder(CPUOpBuilder): + BUILD_VAR = 'APEX_BUILD_APEX_C' + 
INCLUDE_FLAG = "APEX_BUILD_CPP_OPS" + NAME = "apex_C" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ["csrc/flatten_unflatten.cpp"] + + def include_paths(self): + return ['csrc/' ] + + def libraries_args(self): + args = super().libraries_args() + return args \ No newline at end of file diff --git a/op_builder/bnp.py b/op_builder/bnp.py new file mode 100644 index 000000000..f7fbe1abd --- /dev/null +++ b/op_builder/bnp.py @@ -0,0 +1,33 @@ +from .builder import CUDAOpBuilder + +import sys + + +class BnpBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_BNP' + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" + NAME = "bnp" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ['contrib/csrc/groupbn/batch_norm.cu', + 'contrib/csrc/groupbn/ipc.cu', + 'contrib/csrc/groupbn/interface.cpp', + 'contrib/csrc/groupbn/batch_norm_add_relu.cu'] + + def include_paths(self): + return ['contrib/csrc', 'csrc'] + + def cxx_args(self): + return self.version_dependent_macros() + + def nvcc_args(self): + return ['-DCUDA_HAS_FP16=1', + '-D__CUDA_NO_HALF_OPERATORS__', + '-D__CUDA_NO_HALF_CONVERSIONS__', + '-D__CUDA_NO_HALF2_OPERATORS__'] + self.version_dependent_macros() \ No newline at end of file diff --git a/op_builder/builder.py b/op_builder/builder.py new file mode 100644 index 000000000..60e490b2b --- /dev/null +++ b/op_builder/builder.py @@ -0,0 +1,927 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# Portions of this code were adapted from DeepSpeed: +# https://github.com/microsoft/DeepSpeed +# Modified for ROCm Apex + +import os +import re +import sys +import time +import importlib +from pathlib import Path +import subprocess +import shlex +import shutil +import tempfile +import distutils.ccompiler +import distutils.log +import distutils.sysconfig +from distutils.errors import CompileError, LinkError +from abc import ABC, abstractmethod +from typing import List + +YELLOW = '\033[93m' +END = '\033[0m' +WARNING = f"{YELLOW} [WARNING] {END}" + +DEFAULT_TORCH_EXTENSION_PATH = "/tmp/torch_extensions" +DEFAULT_COMPUTE_CAPABILITIES = "6.0;6.1;7.0" + +try: + import torch +except ImportError: + print(f"{WARNING} unable to import torch, please install it if you want to pre-compile any apex ops.") +else: + TORCH_MAJOR = int(torch.__version__.split('.')[0]) + TORCH_MINOR = int(torch.__version__.split('.')[1]) + + +class MissingCUDAException(Exception): + pass + + +class CUDAMismatchException(Exception): + pass + + +def installed_cuda_version(name=""): + import torch.utils.cpp_extension + cuda_home = torch.utils.cpp_extension.CUDA_HOME + if cuda_home is None: + raise MissingCUDAException("CUDA_HOME does not exist, unable to compile CUDA op(s)") + # Ensure there is not a cuda version mismatch between torch and nvcc compiler + output = subprocess.check_output([cuda_home + "/bin/nvcc", "-V"], universal_newlines=True) + output_split = output.split() + release_idx = output_split.index("release") + release = output_split[release_idx + 1].replace(',', '').split(".") + # Ignore patch versions, only look at major + minor + cuda_major, cuda_minor = release[:2] + return int(cuda_major), int(cuda_minor) + + +def get_default_compute_capabilities(): + compute_caps = DEFAULT_COMPUTE_CAPABILITIES + # Update compute capability according to: https://en.wikipedia.org/wiki/CUDA#GPUs_supported + import 
torch.utils.cpp_extension + if torch.utils.cpp_extension.CUDA_HOME is not None: + if installed_cuda_version()[0] == 11: + if installed_cuda_version()[1] >= 0: + compute_caps += ";8.0" + if installed_cuda_version()[1] >= 1: + compute_caps += ";8.6" + if installed_cuda_version()[1] >= 8: + compute_caps += ";9.0" + elif installed_cuda_version()[0] == 12: + compute_caps += ";8.0;8.6;9.0" + if installed_cuda_version()[1] >= 8: + compute_caps += ";10.0;12.0" + return compute_caps + + +# list compatible minor CUDA versions - so that for example pytorch built with cuda-11.0 can be used +# to build apex and system-wide installed cuda 11.2 +cuda_minor_mismatch_ok = { + 10: ["10.0", "10.1", "10.2"], + 11: ["11.0", "11.1", "11.2", "11.3", "11.4", "11.5", "11.6", "11.7", "11.8"], + 12: ["12.0", "12.1", "12.2", "12.3", "12.4", "12.5", "12.6", + "12.8"], # There does not appear to be a CUDA Toolkit 12.7 +} + + +def assert_no_cuda_mismatch(name=""): + cuda_major, cuda_minor = installed_cuda_version(name) + sys_cuda_version = f'{cuda_major}.{cuda_minor}' + torch_cuda_version = ".".join(torch.version.cuda.split('.')[:2]) + # This is a show-stopping error, should probably not proceed past this + if sys_cuda_version != torch_cuda_version: + if (cuda_major in cuda_minor_mismatch_ok and sys_cuda_version in cuda_minor_mismatch_ok[cuda_major] + and torch_cuda_version in cuda_minor_mismatch_ok[cuda_major]): + print(f"Installed CUDA version {sys_cuda_version} does not match the " + f"version torch was compiled with {torch.version.cuda} " + "but since the APIs are compatible, accepting this combination") + return True + elif os.getenv("APEX_SKIP_CUDA_CHECK", "0") == "1": + print( + f"{WARNING} Apex Op Builder: Installed CUDA version {sys_cuda_version} does not match the " + f"version torch was compiled with {torch.version.cuda}." + "Detected `APEX_SKIP_CUDA_CHECK=1`: Allowing this combination of CUDA, but it may result in unexpected behavior." + ) + return True + raise CUDAMismatchException( + f">- Apex Op Builder: Installed CUDA version {sys_cuda_version} does not match the " + f"version torch was compiled with {torch.version.cuda}, unable to compile " + "cuda/cpp extensions without a matching cuda version.") + return True + + +class OpBuilder(ABC): + _rocm_version = None + _rocm_gpu_arch = None + _rocm_wavefront_size = None + _is_rocm_pytorch = None + _is_sycl_enabled = None + _loaded_ops = {} + + def __init__(self, name): + self.name = name + self.jit_mode = False + self.build_for_cpu = False + self.enable_bf16 = False + self.error_log = None + + @abstractmethod + def absolute_name(self): + ''' + Returns absolute build path for cases where the op is pre-installed, e.g., apex.ops.adam.cpu_adam + will be installed as something like: apex/ops/adam/cpu_adam.so + ''' + pass + + @abstractmethod + def sources(self): + ''' + Returns list of source files for your op, relative to root of apex package + ''' + pass + + def hipify_extension(self): + pass + + def sycl_extension(self): + pass + + @staticmethod + def validate_torch_version(torch_info): + install_torch_version = torch_info['version'] + current_torch_version = ".".join(torch.__version__.split('.')[:2]) + if install_torch_version != current_torch_version: + raise RuntimeError("PyTorch version mismatch! apex ops were compiled and installed " + "with a different version than what is being used at runtime. " + f"Please re-install apex or switch torch versions. 
" + f"Install torch version={install_torch_version}, " + f"Runtime torch version={current_torch_version}") + + @staticmethod + def validate_torch_op_version(torch_info): + if not OpBuilder.is_rocm_pytorch(): + current_cuda_version = ".".join(torch.version.cuda.split('.')[:2]) + install_cuda_version = torch_info['cuda_version'] + if install_cuda_version != current_cuda_version: + raise RuntimeError("CUDA version mismatch! apex ops were compiled and installed " + "with a different version than what is being used at runtime. " + f"Please re-install apex or switch torch versions. " + f"Install CUDA version={install_cuda_version}, " + f"Runtime CUDA version={current_cuda_version}") + else: + current_hip_version = ".".join(torch.version.hip.split('.')[:2]) + install_hip_version = torch_info['hip_version'] + if install_hip_version != current_hip_version: + raise RuntimeError("HIP version mismatch! apex ops were compiled and installed " + "with a different version than what is being used at runtime. " + f"Please re-install apex or switch torch versions. " + f"Install HIP version={install_hip_version}, " + f"Runtime HIP version={current_hip_version}") + + @staticmethod + def is_rocm_pytorch(): + if OpBuilder._is_rocm_pytorch is not None: + return OpBuilder._is_rocm_pytorch + + _is_rocm_pytorch = False + try: + import torch + except ImportError: + pass + else: + if TORCH_MAJOR > 1 or (TORCH_MAJOR == 1 and TORCH_MINOR >= 5): + _is_rocm_pytorch = hasattr(torch.version, 'hip') and torch.version.hip is not None + if _is_rocm_pytorch: + from torch.utils.cpp_extension import ROCM_HOME + _is_rocm_pytorch = ROCM_HOME is not None + OpBuilder._is_rocm_pytorch = _is_rocm_pytorch + return OpBuilder._is_rocm_pytorch + + @staticmethod + def is_sycl_enabled(): + if OpBuilder._is_sycl_enabled is not None: + return OpBuilder._is_sycl_enabled + + _is_sycl_enabled = False + try: + result = subprocess.run(["c2s", "--version"], capture_output=True) + except: + pass + else: + _is_sycl_enabled = True + + OpBuilder._is_sycl_enabled = _is_sycl_enabled + return OpBuilder._is_sycl_enabled + + @staticmethod + def installed_rocm_version(): + if OpBuilder._rocm_version: + return OpBuilder._rocm_version + + ROCM_MAJOR = '0' + ROCM_MINOR = '0' + ROCM_VERSION_DEV_RAW = "" + if OpBuilder.is_rocm_pytorch(): + from torch.utils.cpp_extension import ROCM_HOME + rocm_ver_file = Path(ROCM_HOME).joinpath(".info/version") + if rocm_ver_file.is_file(): + with open(rocm_ver_file, 'r') as file: + ROCM_VERSION_DEV_RAW = file.read() + elif "rocm" in torch.__version__: + ROCM_VERSION_DEV_RAW = torch.__version__.split("rocm")[1] + if ROCM_VERSION_DEV_RAW != "": + ROCM_MAJOR = ROCM_VERSION_DEV_RAW.split('.')[0] + ROCM_MINOR = ROCM_VERSION_DEV_RAW.split('.')[1] + else: + # Look in /usr/include/rocm-version.h + rocm_ver_file = Path("/usr/include/rocm_version.h") + if rocm_ver_file.is_file(): + with open(rocm_ver_file, 'r') as file: + for ln in file.readlines(): + if "#define ROCM_VERSION_MAJOR" in ln: + ROCM_MAJOR = re.findall(r'\S+', ln)[2] + elif "#define ROCM_VERSION_MINOR" in ln: + ROCM_MINOR = re.findall(r'\S+', ln)[2] + if ROCM_MAJOR == '0': + assert False, "Could not detect ROCm version" + + OpBuilder._rocm_version = (int(ROCM_MAJOR), int(ROCM_MINOR)) + return OpBuilder._rocm_version + + @staticmethod + def get_rocm_gpu_arch(): + if OpBuilder._rocm_gpu_arch: + return OpBuilder._rocm_gpu_arch + rocm_info = Path("/opt/rocm/bin/rocminfo") + if (not rocm_info.is_file()): + rocm_info = Path("rocminfo") + rocm_gpu_arch_cmd = str(rocm_info) + " | 
grep -o -m 1 'gfx.*'" + try: + result = subprocess.check_output(rocm_gpu_arch_cmd, shell=True) + rocm_gpu_arch = result.decode('utf-8').strip() + except subprocess.CalledProcessError: + rocm_gpu_arch = "" + OpBuilder._rocm_gpu_arch = rocm_gpu_arch + return OpBuilder._rocm_gpu_arch + + @staticmethod + def get_rocm_wavefront_size(): + if OpBuilder._rocm_wavefront_size: + return OpBuilder._rocm_wavefront_size + + rocm_info = Path("/opt/rocm/bin/rocminfo") + if (not rocm_info.is_file()): + rocm_info = Path("rocminfo") + rocm_wavefront_size_cmd = str( + rocm_info) + " | grep -Eo -m1 'Wavefront Size:[[:space:]]+[0-9]+' | grep -Eo '[0-9]+'" + try: + result = subprocess.check_output(rocm_wavefront_size_cmd, shell=True) + rocm_wavefront_size = result.decode('utf-8').strip() + except subprocess.CalledProcessError: + rocm_wavefront_size = "32" + OpBuilder._rocm_wavefront_size = rocm_wavefront_size + return OpBuilder._rocm_wavefront_size + + def include_paths(self): + ''' + Returns list of include paths, relative to root of apex package + ''' + return [] + + def nvcc_args(self): + ''' + Returns optional list of compiler flags to forward to nvcc when building CUDA sources + ''' + return [] + + def cxx_args(self): + ''' + Returns optional list of compiler flags to forward to the build + ''' + return [] + + def is_compatible(self, verbose=False): + ''' + Check if all non-python dependencies are satisfied to build this op + ''' + return True + + def extra_ldflags(self): + return [] + + def has_function(self, funcname, libraries, library_dirs=None, verbose=False): + ''' + Test for existence of a function within a tuple of libraries. + + This is used as a smoke test to check whether a certain library is available. + As a test, this creates a simple C program that calls the specified function, + and then distutils is used to compile that program and link it with the specified libraries. + Returns True if both the compile and link are successful, False otherwise. + ''' + tempdir = None # we create a temporary directory to hold various files + filestderr = None # handle to open file to which we redirect stderr + oldstderr = None # file descriptor for stderr + try: + # Echo compile and link commands that are used. + if verbose: + distutils.log.set_verbosity(1) + + # Create a compiler object. + compiler = distutils.ccompiler.new_compiler(verbose=verbose) + + # Configure compiler and linker to build according to Python install. + distutils.sysconfig.customize_compiler(compiler) + + # Create a temporary directory to hold test files. + tempdir = tempfile.mkdtemp() + + # Define a simple C program that calls the function in question + prog = "void %s(void); int main(int argc, char** argv) { %s(); return 0; }" % (funcname, funcname) + + # Write the test program to a file. + filename = os.path.join(tempdir, 'test.c') + with open(filename, 'w') as f: + f.write(prog) + + # Redirect stderr file descriptor to a file to silence compile/link warnings. + if not verbose: + filestderr = open(os.path.join(tempdir, 'stderr.txt'), 'w') + oldstderr = os.dup(sys.stderr.fileno()) + os.dup2(filestderr.fileno(), sys.stderr.fileno()) + + # Workaround for behavior in distutils.ccompiler.CCompiler.object_filenames() + # Otherwise, a local directory will be used instead of tempdir + drive, driveless_filename = os.path.splitdrive(filename) + root_dir = driveless_filename[0] if os.path.isabs(driveless_filename) else '' + output_dir = os.path.join(drive, root_dir) + + # Attempt to compile the C program into an object file. 
+ cflags = shlex.split(os.environ.get('CFLAGS', "")) + objs = compiler.compile([filename], output_dir=output_dir, extra_preargs=self.strip_empty_entries(cflags)) + + # Attempt to link the object file into an executable. + # Be sure to tack on any libraries that have been specified. + ldflags = shlex.split(os.environ.get('LDFLAGS', "")) + compiler.link_executable(objs, + os.path.join(tempdir, 'a.out'), + extra_preargs=self.strip_empty_entries(ldflags), + libraries=libraries, + library_dirs=library_dirs) + + # Compile and link succeeded + return True + + except CompileError: + return False + + except LinkError: + return False + + except: + return False + + finally: + # Restore stderr file descriptor and close the stderr redirect file. + if oldstderr is not None: + os.dup2(oldstderr, sys.stderr.fileno()) + if filestderr is not None: + filestderr.close() + + # Delete the temporary directory holding the test program and stderr files. + if tempdir is not None: + shutil.rmtree(tempdir) + + def strip_empty_entries(self, args): + ''' + Drop any empty strings from the list of compile and link flags + ''' + return [x for x in args if len(x) > 0] + + def cpu_arch(self): + try: + from cpuinfo import get_cpu_info + except ImportError as e: + cpu_info = self._backup_cpuinfo() + if cpu_info is None: + return "-march=native" + + try: + cpu_info = get_cpu_info() + except Exception as e: + self.warning(f"{self.name} attempted to use py-cpuinfo but failed (exception type: {type(e)}, {e}), " + "falling back to lscpu to get this information.") + cpu_info = self._backup_cpuinfo() + if cpu_info is None: + return "-march=native" + + if cpu_info['arch'].startswith('PPC_'): + # gcc does not provide -march on PowerPC, use -mcpu instead + return '-mcpu=native' + return '-march=native' + + def get_cuda_compile_flag(self): + try: + if not self.is_rocm_pytorch(): + assert_no_cuda_mismatch(self.name) + return "-D__ENABLE_CUDA__" + except MissingCUDAException: + print(f"{WARNING} {self.name} cuda is missing or is incompatible with installed torch, " + "only cpu ops can be compiled!") + return '-D__DISABLE_CUDA__' + return '-D__DISABLE_CUDA__' + + def _backup_cpuinfo(self): + # Construct cpu_info dict from lscpu that is similar to what py-cpuinfo provides + if not self.command_exists('lscpu'): + self.warning(f"{self.name} attempted to query 'lscpu' after failing to use py-cpuinfo " + "to detect the CPU architecture. 
'lscpu' does not appear to exist on " + "your system, will fall back to use -march=native and non-vectorized execution.") + return None + result = subprocess.check_output(['lscpu']) + result = result.decode('utf-8').strip().lower() + + cpu_info = {} + cpu_info['arch'] = None + cpu_info['flags'] = "" + if 'genuineintel' in result or 'authenticamd' in result: + cpu_info['arch'] = 'X86_64' + if 'avx512' in result: + cpu_info['flags'] += 'avx512,' + elif 'avx512f' in result: + cpu_info['flags'] += 'avx512f,' + if 'avx2' in result: + cpu_info['flags'] += 'avx2' + elif 'ppc64le' in result: + cpu_info['arch'] = "PPC_" + + return cpu_info + + def simd_width(self): + try: + from cpuinfo import get_cpu_info + except ImportError as e: + cpu_info = self._backup_cpuinfo() + if cpu_info is None: + return '-D__SCALAR__' + + try: + cpu_info = get_cpu_info() + except Exception as e: + self.warning(f"{self.name} attempted to use py-cpuinfo but failed (exception type: {type(e)}, {e}), " + "falling back to lscpu to get this information.") + cpu_info = self._backup_cpuinfo() + if cpu_info is None: + return '-D__SCALAR__' + + if cpu_info['arch'] == 'X86_64': + if 'avx512' in cpu_info['flags'] or 'avx512f' in cpu_info['flags']: + return '-D__AVX512__' + elif 'avx2' in cpu_info['flags']: + return '-D__AVX256__' + return '-D__SCALAR__' + + def command_exists(self, cmd): + if '|' in cmd: + cmds = cmd.split("|") + else: + cmds = [cmd] + valid = False + for cmd in cmds: + safe_cmd = ["bash", "-c", f"type {cmd}"] + result = subprocess.Popen(safe_cmd, stdout=subprocess.PIPE) + valid = valid or result.wait() == 0 + + if not valid and len(cmds) > 1: + print(f"{WARNING} {self.name} requires one of the following commands '{cmds}', but it does not exist!") + elif not valid and len(cmds) == 1: + print(f"{WARNING} {self.name} requires the '{cmd}' command, but it does not exist!") + return valid + + def warning(self, msg): + self.error_log = f"{msg}" + print(f"{WARNING} {msg}") + + def apex_src_path(self, code_path): + if os.path.isabs(code_path): + return code_path + else: + return os.path.join(Path(__file__).parent.parent.absolute(), code_path) + + def builder(self): + from torch.utils.cpp_extension import CppExtension + include_dirs = [os.path.abspath(x) for x in self.strip_empty_entries(self.include_paths())] + return CppExtension(name=self.absolute_name(), + sources=self.strip_empty_entries(self.sources()), + include_dirs=include_dirs, + extra_compile_args={'cxx': self.strip_empty_entries(self.cxx_args())}, + extra_link_args=self.strip_empty_entries(self.extra_ldflags())) + + def load(self, verbose=True): + if self.name in __class__._loaded_ops: + return __class__._loaded_ops[self.name] + + from apex.git_version_info import installed_ops, torch_info + if installed_ops.get(self.name, False): + # Ensure the op we're about to load was compiled with the same + # torch/cuda versions we are currently using at runtime. + self.validate_torch_version(torch_info) + if torch.cuda.is_available() and isinstance(self, CUDAOpBuilder): + self.validate_torch_op_version(torch_info) + + op_module = importlib.import_module(self.absolute_name()) + __class__._loaded_ops[self.name] = op_module + return op_module + else: + return self.jit_load(verbose) + + def jit_load(self, verbose=True): + if not self.is_compatible(verbose): + raise RuntimeError( + f"Unable to JIT load the {self.name} op due to it not being compatible due to hardware/software issue. 
{self.error_log}" + ) + try: + import ninja # noqa: F401 # type: ignore + except ImportError: + raise RuntimeError(f"Unable to JIT load the {self.name} op due to ninja not being installed.") + + if isinstance(self, CUDAOpBuilder) and not self.is_rocm_pytorch(): + self.build_for_cpu = not torch.cuda.is_available() + + self.jit_mode = True + from torch.utils.cpp_extension import load + + start_build = time.time() + sources = [os.path.abspath(self.apex_src_path(path)) for path in self.sources()] + extra_include_paths = [os.path.abspath(self.apex_src_path(path)) for path in self.include_paths()] + + # Torch will try and apply whatever CCs are in the arch list at compile time, + # we have already set the intended targets ourselves we know that will be + # needed at runtime. This prevents CC collisions such as multiple __half + # implementations. Stash arch list to reset after build. + torch_arch_list = None + if "TORCH_CUDA_ARCH_LIST" in os.environ: + torch_arch_list = os.environ.get("TORCH_CUDA_ARCH_LIST") + os.environ["TORCH_CUDA_ARCH_LIST"] = "" + + nvcc_args = self.strip_empty_entries(self.nvcc_args()) + cxx_args = self.strip_empty_entries(self.cxx_args()) + + cxx_args.append("-UC10_USE_GLOG") + nvcc_args.append("-UC10_USE_GLOG") + if isinstance(self, CUDAOpBuilder): + if not self.build_for_cpu and self.enable_bf16: + cxx_args.append("-DBF16_AVAILABLE") + nvcc_args.append("-DBF16_AVAILABLE") + nvcc_args.append("-U__CUDA_NO_BFLOAT16_OPERATORS__") + nvcc_args.append("-U__CUDA_NO_BFLOAT162_OPERATORS__") + nvcc_args.append("-U__CUDA_NO_BFLOAT16_CONVERSIONS__") + + if self.is_rocm_pytorch(): + cxx_args.append("-D__HIP_PLATFORM_AMD__=1") + os.environ["PYTORCH_ROCM_ARCH"] = self.get_rocm_gpu_arch() + cxx_args.append('-DROCM_WAVEFRONT_SIZE=%s' % self.get_rocm_wavefront_size()) + + op_module = load(name=self.name, + sources=self.strip_empty_entries(sources), + extra_include_paths=self.strip_empty_entries(extra_include_paths), + extra_cflags=cxx_args, + extra_cuda_cflags=nvcc_args, + extra_ldflags=self.strip_empty_entries(self.extra_ldflags()), + with_cuda=True if (isinstance(self, CUDAOpBuilder) and not self.build_for_cpu) else None, + verbose=verbose) + + build_duration = time.time() - start_build + if verbose: + print(f"Time to load {self.name} op: {build_duration} seconds") + + # Reset arch list so we are not silently removing it for other possible use cases + if torch_arch_list: + os.environ["TORCH_CUDA_ARCH_LIST"] = torch_arch_list + + __class__._loaded_ops[self.name] = op_module + + return op_module + + +class CUDAOpBuilder(OpBuilder): + + def compute_capability_args(self, cross_compile_archs=None): + """ + Returns nvcc compute capability compile flags. + + 1. `TORCH_CUDA_ARCH_LIST` takes priority over `cross_compile_archs`. + 2. If neither is set default compute capabilities will be used + 3. Under `jit_mode` compute capabilities of all visible cards will be used plus PTX + + Format: + + - `TORCH_CUDA_ARCH_LIST` may use ; or whitespace separators. Examples: + + TORCH_CUDA_ARCH_LIST="6.1;7.5;8.6;9.0;10.0" pip install ... + TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0 7.5 8.0 8.6 9.0 10.0+PTX" pip install ... + + - `cross_compile_archs` uses ; separator. 
+ + """ + ccs = [] + if self.jit_mode: + # Compile for underlying architectures since we know those at runtime + for i in range(torch.cuda.device_count()): + CC_MAJOR, CC_MINOR = torch.cuda.get_device_capability(i) + cc = f"{CC_MAJOR}.{CC_MINOR}" + if cc not in ccs: + ccs.append(cc) + ccs = sorted(ccs) + ccs[-1] += '+PTX' + else: + # Cross-compile mode, compile for various architectures + # env override takes priority + cross_compile_archs_env = os.environ.get('TORCH_CUDA_ARCH_LIST', None) + if cross_compile_archs_env is not None: + if cross_compile_archs is not None: + print( + f"{WARNING} env var TORCH_CUDA_ARCH_LIST={cross_compile_archs_env} overrides cross_compile_archs={cross_compile_archs}" + ) + cross_compile_archs = cross_compile_archs_env.replace(' ', ';') + else: + if cross_compile_archs is None: + cross_compile_archs = get_default_compute_capabilities() + ccs = cross_compile_archs.split(';') + + ccs = self.filter_ccs(ccs) + if len(ccs) == 0: + raise RuntimeError( + f"Unable to load {self.name} op due to no compute capabilities remaining after filtering") + + args = [] + self.enable_bf16 = True + for cc in ccs: + num = cc[0] + cc[1].split('+')[0] + args.append(f'-gencode=arch=compute_{num},code=sm_{num}') + if cc[1].endswith('+PTX'): + args.append(f'-gencode=arch=compute_{num},code=compute_{num}') + + if int(cc[0]) <= 7: + self.enable_bf16 = False + + return args + + def filter_ccs(self, ccs: List[str]): + """ + Prune any compute capabilities that are not compatible with the builder. Should log + which CCs have been pruned. + """ + return [cc.split('.') for cc in ccs] + + def version_dependent_macros(self): + # Fix from apex that might be relevant for us as well, related to https://github.com/NVIDIA/apex/issues/456 + version_ge_1_1 = [] + if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 0): + version_ge_1_1 = ['-DVERSION_GE_1_1'] + version_ge_1_3 = [] + if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 2): + version_ge_1_3 = ['-DVERSION_GE_1_3'] + version_ge_1_5 = [] + if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 4): + version_ge_1_5 = ['-DVERSION_GE_1_5'] + + version_dependent_macro_args = version_ge_1_1 + version_ge_1_3 + version_ge_1_5 + if self.is_rocm_pytorch() and (self.torch_version()[0] >= 6): + version_dependent_macro_args += ["-DHIPBLAS_V2"] + + return version_dependent_macro_args + + def is_compatible(self, verbose=False): + return super().is_compatible(verbose) + + def builder(self): + try: + if not self.is_rocm_pytorch(): + assert_no_cuda_mismatch(self.name) + self.build_for_cpu = False + except MissingCUDAException: + self.build_for_cpu = True + + if self.build_for_cpu: + from torch.utils.cpp_extension import CppExtension as ExtensionBuilder + else: + from torch.utils.cpp_extension import CUDAExtension as ExtensionBuilder + include_dirs = [os.path.abspath(x) for x in self.strip_empty_entries(self.include_paths())] + compile_args = {'cxx': self.strip_empty_entries(self.cxx_args())} if self.build_for_cpu else \ + {'cxx': self.strip_empty_entries(self.cxx_args()), \ + 'nvcc': self.strip_empty_entries(self.nvcc_args())} + + if not self.build_for_cpu and self.enable_bf16: + compile_args['cxx'].append("-DBF16_AVAILABLE") + compile_args['nvcc'].append("-DBF16_AVAILABLE") + + if self.is_rocm_pytorch(): + compile_args['cxx'].append("-D__HIP_PLATFORM_AMD__=1") + #cxx compiler args are required to compile cpp files + compile_args['cxx'].append('-DROCM_WAVEFRONT_SIZE=%s' % self.get_rocm_wavefront_size()) + #nvcc compiler args are required to 
compile hip files + compile_args['nvcc'].append('-DROCM_WAVEFRONT_SIZE=%s' % self.get_rocm_wavefront_size()) + if self.get_rocm_gpu_arch(): + os.environ["PYTORCH_ROCM_ARCH"] = self.get_rocm_gpu_arch() + + cuda_ext = ExtensionBuilder(name=self.absolute_name(), + sources=self.strip_empty_entries(self.sources()), + include_dirs=include_dirs, + libraries=self.strip_empty_entries(self.libraries_args()), + extra_compile_args=compile_args, + extra_link_args=self.strip_empty_entries(self.extra_ldflags())) + + if self.is_rocm_pytorch(): + # hip converts paths to absolute, this converts back to relative + sources = cuda_ext.sources + curr_file = Path(__file__).parent.parent # ds root + for i in range(len(sources)): + src = Path(sources[i]) + if src.is_absolute(): + sources[i] = str(src.relative_to(curr_file)) + else: + sources[i] = str(src) + cuda_ext.sources = sources + return cuda_ext + + def hipify_extension(self): + if self.is_rocm_pytorch(): + from torch.utils.hipify import hipify_python + hipify_python.hipify( + project_directory=os.getcwd(), + output_directory=os.getcwd(), + header_include_dirs=self.include_paths(), + includes=[os.path.join(os.getcwd(), '*')], + extra_files=[os.path.abspath(s) for s in self.sources()], + show_detailed=True, + is_pytorch_extension=True, + hipify_extra_files_only=True, + ) + + def cxx_args(self): + if sys.platform == "win32": + return ['-O2'] + else: + return ['-O3', '-std=c++17', '-g', '-Wno-reorder'] + + def nvcc_args(self): + if self.build_for_cpu: + return [] + args = ['-O3'] + if self.is_rocm_pytorch(): + ROCM_MAJOR, ROCM_MINOR = self.installed_rocm_version() + args += [ + '-std=c++17', '-U__HIP_NO_HALF_OPERATORS__', '-U__HIP_NO_HALF_CONVERSIONS__', + '-U__HIP_NO_HALF2_OPERATORS__', + '-DROCM_VERSION_MAJOR=%s' % ROCM_MAJOR, + '-DROCM_VERSION_MINOR=%s' % ROCM_MINOR + ] + else: + try: + nvcc_threads = int(os.getenv("APEX_NVCC_THREADS", "")) + if nvcc_threads <= 0: + raise ValueError("") + except ValueError: + nvcc_threads = min(os.cpu_count(), 8) + + cuda_major, cuda_minor = installed_cuda_version() + if cuda_major > 10: + if cuda_major == 12 and cuda_minor >= 5: + std_lib = '-std=c++20' + else: + std_lib = '-std=c++17' + else: + std_lib = '-std=c++14' + args += [ + '-allow-unsupported-compiler' if sys.platform == "win32" else '', '--use_fast_math', std_lib, + '-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', '-U__CUDA_NO_HALF2_OPERATORS__', + f'--threads={nvcc_threads}' + ] + if os.environ.get('APEX_DEBUG_CUDA_BUILD', '0') == '1': + args.append('--ptxas-options=-v') + args += self.compute_capability_args() + return args + + def libraries_args(self): + if self.build_for_cpu: + return [] + + if sys.platform == "win32": + return ['cublas', 'curand'] + else: + return [] + + def backward_pass_guard_args(self): + torch_dir = torch.__path__[0] + context_file = os.path.join(torch_dir, "include", "ATen", "Context.h") + if os.path.exists(context_file): + lines = open(context_file, 'r').readlines() + found_Backward_Pass_Guard = False + found_ROCmBackward_Pass_Guard = False + for line in lines: + if "BackwardPassGuard" in line: + # BackwardPassGuard has been renamed to ROCmBackwardPassGuard + # https://github.com/pytorch/pytorch/pull/71881/commits/4b82f5a67a35406ffb5691c69e6b4c9086316a43 + if "ROCmBackwardPassGuard" in line: + found_ROCmBackward_Pass_Guard = True + else: + found_Backward_Pass_Guard = True + break + backward_pass_guard_args = [] + if found_Backward_Pass_Guard: + backward_pass_guard_args += ['-DBACKWARD_PASS_GUARD'] + 
['-DBACKWARD_PASS_GUARD_CLASS=BackwardPassGuard'] + if found_ROCmBackward_Pass_Guard: + backward_pass_guard_args += ['-DBACKWARD_PASS_GUARD'] + ['-DBACKWARD_PASS_GUARD_CLASS=ROCmBackwardPassGuard'] + return backward_pass_guard_args + + def aten_atomic_args(self): + torch_dir = torch.__path__[0] + if os.path.exists(os.path.join(torch_dir, "include", "ATen", "Atomic.cuh")): + return ['-DATEN_ATOMIC_HEADER'] + else: + return [] + + def generator_args(self): + generator_flag = [] + torch_dir = torch.__path__[0] + if os.path.exists(os.path.join(torch_dir, "include", "ATen", "CUDAGeneratorImpl.h")): + generator_flag = ["-DOLD_GENERATOR_PATH"] + return generator_flag + + def nvcc_threads_args(self): + cuda_major, cuda_minor = installed_cuda_version() + if cuda_major >= 11 and cuda_minor >= 2: + return ["--threads", "4"] + return [] + + def nccl_args(self): + nccl_library = ["-lnccl"] + if self.is_rocm_pytorch(): + nccl_library = ["-lrccl"] + return nccl_library + + def nccl_version(self): + return torch.cuda.nccl.version()[0:2] + + def torch_version(self): + return (TORCH_MAJOR, TORCH_MINOR) + + def is_supported(self): + return super().is_supported() + +class CPUOpBuilder(CUDAOpBuilder): + + def get_cuda_lib64_path(self): + import torch + if not self.is_rocm_pytorch(): + CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "lib64") + if not os.path.exists(CUDA_LIB64): + CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "lib") + else: + CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.ROCM_HOME, "lib") + return CUDA_LIB64 + + def extra_ldflags(self): + if self.build_for_cpu: + return ['-fopenmp'] + + if not self.is_rocm_pytorch(): + ld_flags = ['-lcurand'] + if not self.build_for_cpu: + ld_flags.append(f'-L{self.get_cuda_lib64_path()}') + return ld_flags + + return [] + + def cxx_args(self): + args = [] + if not self.build_for_cpu: + CUDA_LIB64 = self.get_cuda_lib64_path() + + args += super().cxx_args() + args += [ + f'-L{CUDA_LIB64}', + '-lcudart', + '-lcublas', + '-g', + ] + + CPU_ARCH = self.cpu_arch() + SIMD_WIDTH = self.simd_width() + CUDA_ENABLE = self.get_cuda_compile_flag() + args += [ + CPU_ARCH, + '-fopenmp', + SIMD_WIDTH, + CUDA_ENABLE, + ] + + return args \ No newline at end of file diff --git a/op_builder/distributed_adam.py b/op_builder/distributed_adam.py new file mode 100644 index 000000000..ef453bee9 --- /dev/null +++ b/op_builder/distributed_adam.py @@ -0,0 +1,33 @@ +from .builder import CUDAOpBuilder + +import sys + + +class DistributedAdamBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_DISTRIBUTED_ADAM' + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" + NAME = "distributed_adam_cuda" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ['contrib/csrc/optimizers/multi_tensor_distopt_adam.cpp', + 'contrib/csrc/optimizers/multi_tensor_distopt_adam_kernel.cu'] + + def include_paths(self): + return ['contrib/csrc/', + 'csrc'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + nvcc_flags = ['-O3'] + self.version_dependent_macros() + if not self.is_rocm_pytorch(): + nvcc_flags += ['--use_fast_math'] + return nvcc_flags \ No newline at end of file diff --git a/op_builder/distributed_lamb.py b/op_builder/distributed_lamb.py new file mode 100644 index 000000000..74d77d129 --- /dev/null +++ b/op_builder/distributed_lamb.py @@ -0,0 +1,33 @@ +from .builder import CUDAOpBuilder + +import sys + + +class 
DistributedLambBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_DISTRIBUTED_LAMB' + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" + NAME = "distributed_lamb_cuda" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ['contrib/csrc/optimizers/multi_tensor_distopt_lamb.cpp', + 'contrib/csrc/optimizers/multi_tensor_distopt_lamb_kernel.cu'] + + def include_paths(self): + return ['contrib/csrc/', + 'csrc'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + nvcc_flags = ['-O3'] + self.version_dependent_macros() + if not self.is_rocm_pytorch(): + nvcc_flags += ['--use_fast_math'] + return nvcc_flags \ No newline at end of file diff --git a/op_builder/fast_multihead_attn.py b/op_builder/fast_multihead_attn.py new file mode 100644 index 000000000..0f2f8b52f --- /dev/null +++ b/op_builder/fast_multihead_attn.py @@ -0,0 +1,50 @@ +from .builder import CUDAOpBuilder + +import sys + + +class FastMultiheadAttnBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_FAST_MULTIHEAD_ATTN' + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" + NAME = "fast_multihead_attn" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ['contrib/csrc/multihead_attn/multihead_attn_frontend.cpp', + 'contrib/csrc/multihead_attn/additive_masked_softmax_dropout_cuda.cu', + "contrib/csrc/multihead_attn/masked_softmax_dropout_cuda.cu", + "contrib/csrc/multihead_attn/encdec_multihead_attn_cuda.cu", + "contrib/csrc/multihead_attn/encdec_multihead_attn_norm_add_cuda.cu", + "contrib/csrc/multihead_attn/self_multihead_attn_cuda.cu", + "contrib/csrc/multihead_attn/self_multihead_attn_bias_additive_mask_cuda.cu", + "contrib/csrc/multihead_attn/self_multihead_attn_bias_cuda.cu", + "contrib/csrc/multihead_attn/self_multihead_attn_norm_add_cuda.cu"] + + def include_paths(self): + return ['csrc/', + 'contrib/csrc/', + 'contrib/csrc/multihead_attn'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + self.generator_args() + + def nvcc_args(self): + nvcc_flags = ['-O3'] + self.version_dependent_macros() + self.generator_args() + if not self.is_rocm_pytorch(): + nvcc_flags += ['-U__CUDA_NO_HALF_OPERATORS__', + '-U__CUDA_NO_HALF_CONVERSIONS__', + '--expt-relaxed-constexpr', + '--expt-extended-lambda', + '--use_fast_math'] + self.compute_capability_args() + else: + nvcc_flags += ['-I/opt/rocm/include/hiprand', + '-I/opt/rocm/include/rocrand', + '-U__HIP_NO_HALF_OPERATORS__', + '-U__HIP_NO_HALF_CONVERSIONS__'] + self.backward_pass_guard_args() + return nvcc_flags \ No newline at end of file diff --git a/op_builder/focal_loss.py b/op_builder/focal_loss.py new file mode 100644 index 000000000..98a21330a --- /dev/null +++ b/op_builder/focal_loss.py @@ -0,0 +1,33 @@ +from .builder import CUDAOpBuilder + +import sys + + +class FocalLossBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_FOCAL_LOSS' + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" + NAME = "focal_loss_cuda" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ['contrib/csrc/focal_loss/focal_loss_cuda.cpp', + 'contrib/csrc/focal_loss/focal_loss_cuda_kernel.cu'] + + def include_paths(self): + return ['contrib/csrc/' ] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + 
+ def nvcc_args(self): + if self.is_rocm_pytorch(): + nvcc_flags = ['-O3'] + self.version_dependent_macros() + else: + nvcc_flags = ['-O3', '--ftz=false', '--use_fast_math'] + return nvcc_flags \ No newline at end of file diff --git a/op_builder/fused_adam.py b/op_builder/fused_adam.py new file mode 100644 index 000000000..f335368d8 --- /dev/null +++ b/op_builder/fused_adam.py @@ -0,0 +1,33 @@ +from .builder import CUDAOpBuilder + +import sys + + +class FusedAdamBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_FUSED_ADAM' + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" + NAME = "fused_adam_cuda" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ['contrib/csrc/optimizers/fused_adam_cuda.cpp', + 'contrib/csrc/optimizers/fused_adam_cuda_kernel.cu'] + + def include_paths(self): + return ['contrib/csrc/', + 'csrc'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + nvcc_flags = ['-O3'] + self.version_dependent_macros() + if not self.is_rocm_pytorch(): + nvcc_flags += ['--use_fast_math'] + return nvcc_flags \ No newline at end of file diff --git a/op_builder/fused_bias_swiglu.py b/op_builder/fused_bias_swiglu.py new file mode 100644 index 000000000..4a7d13881 --- /dev/null +++ b/op_builder/fused_bias_swiglu.py @@ -0,0 +1,57 @@ +from .builder import CUDAOpBuilder +import sys +import os + +class FusedBiasSwiGLUBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_FUSED_BIAS_SWIGLU' + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" + NAME = "fused_bias_swiglu" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return [ + "csrc/megatron/fused_bias_swiglu.cpp", + "csrc/megatron/fused_bias_swiglu_cuda.cu" + ] + + def include_paths(self): + return ['csrc'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + nvcc_flags = [ + '-O3', + '-U__CUDA_NO_HALF_OPERATORS__', + '-U__CUDA_NO_HALF_CONVERSIONS__' + ] + self.version_dependent_macros() + if not self.is_rocm_pytorch(): + nvcc_flags.extend( + [ + '--expt-relaxed-constexpr', + '--expt-extended-lambda' + ]) + else: + # Handle ROCm arch flags + amdgpu_targets = os.environ.get('PYTORCH_ROCM_ARCH', '') + if not amdgpu_targets: + print("Warning: PYTORCH_ROCM_ARCH environment variable is empty.") + print("Using default architecture. 
Set this variable for specific GPU targets.") + print("Example: export PYTORCH_ROCM_ARCH=gfx906") + amdgpu_targets = "gfx906" + try: + for amdgpu_target in amdgpu_targets.split(';'): + if amdgpu_target: + nvcc_flags += [f'--offload-arch={amdgpu_target}'] + except Exception as e: + print(f"Warning: Error processing PYTORCH_ROCM_ARCH: {e}") + print("Falling back to default architecture gfx906") + nvcc_flags += ['--offload-arch=gfx906'] + return nvcc_flags \ No newline at end of file diff --git a/op_builder/fused_dense.py b/op_builder/fused_dense.py new file mode 100644 index 000000000..4d40eef6d --- /dev/null +++ b/op_builder/fused_dense.py @@ -0,0 +1,28 @@ +from .builder import CUDAOpBuilder + +import sys + + +class FusedDenseBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_FUSED_DENSE' + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" + NAME = "fused_dense_cuda" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ['csrc/fused_dense_base.cpp', 'csrc/fused_dense_cuda.cu'] + + def include_paths(self): + return ['csrc'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + return ['-O3'] + self.version_dependent_macros() \ No newline at end of file diff --git a/op_builder/fused_index_mul_2d.py b/op_builder/fused_index_mul_2d.py new file mode 100644 index 000000000..d04564e15 --- /dev/null +++ b/op_builder/fused_index_mul_2d.py @@ -0,0 +1,34 @@ +from .builder import CUDAOpBuilder + +import sys + + +class FusedIndexMul2dBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_FUSED_INDEX_MUL_2D' + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" + NAME = "fused_index_mul_2d" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ['contrib/csrc/index_mul_2d/index_mul_2d_cuda.cpp', + 'contrib/csrc/index_mul_2d/index_mul_2d_cuda_kernel.cu'] + + def include_paths(self): + return ['contrib/csrc/'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + nvcc_flags = ['-O3'] + self.version_dependent_macros() + if not self.is_rocm_pytorch(): + nvcc_flags += ['--use_fast_math', '--ftz=false'] + else: + nvcc_flags += self.aten_atomic_args() + return nvcc_flags \ No newline at end of file diff --git a/op_builder/fused_lamb.py b/op_builder/fused_lamb.py new file mode 100644 index 000000000..02a0b6fe7 --- /dev/null +++ b/op_builder/fused_lamb.py @@ -0,0 +1,34 @@ +from .builder import CUDAOpBuilder + +import sys + + +class FusedLambBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_FUSED_LAMB' + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" + NAME = "fused_lamb_cuda" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ['contrib/csrc/optimizers/fused_lamb_cuda.cpp', + 'contrib/csrc/optimizers/fused_lamb_cuda_kernel.cu', + 'csrc/multi_tensor_l2norm_kernel.cu'] + + def include_paths(self): + return ['contrib/csrc/', + 'csrc'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + nvcc_flags = ['-O3'] + self.version_dependent_macros() + if not self.is_rocm_pytorch(): + nvcc_flags += ['--use_fast_math'] + return nvcc_flags \ No newline at end of file diff --git a/op_builder/fused_layer_norm.py b/op_builder/fused_layer_norm.py new file mode 100644 index 
000000000..66130f17b --- /dev/null +++ b/op_builder/fused_layer_norm.py @@ -0,0 +1,31 @@ +from .builder import CUDAOpBuilder + +import sys + + +class FusedLayerNormBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_FUSED_LAYER_NORM' + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" + NAME = "fused_layer_norm_cuda" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ['csrc/layer_norm_cuda.cpp', 'csrc/layer_norm_cuda_kernel.cu'] + + def include_paths(self): + return ['csrc'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + nvcc_flags = ['-O3'] + self.version_dependent_macros() + if not self.is_rocm_pytorch(): + nvcc_flags.extend(['--use_fast_math', '-maxrregcount=50']) + return nvcc_flags \ No newline at end of file diff --git a/op_builder/fused_rope.py b/op_builder/fused_rope.py new file mode 100644 index 000000000..c87f14b84 --- /dev/null +++ b/op_builder/fused_rope.py @@ -0,0 +1,40 @@ +from .builder import CUDAOpBuilder + +import sys + + +class FusedRopeBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_FUSED_ROPE' + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" + NAME = "fused_rotary_positional_embedding" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ["csrc/megatron/fused_rotary_positional_embedding.cpp", + "csrc/megatron/fused_rotary_positional_embedding_cuda.cu"] + + def include_paths(self): + return ['csrc', 'csrc/megatron'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + nvcc_flags = [ + '-O3', + '-U__CUDA_NO_HALF_OPERATORS__', + '-U__CUDA_NO_HALF_CONVERSIONS__' + ] + self.version_dependent_macros() + if not self.is_rocm_pytorch(): + nvcc_flags.extend( + [ + '--expt-relaxed-constexpr', + '--expt-extended-lambda' + ]) + return nvcc_flags \ No newline at end of file diff --git a/op_builder/fused_weight_gradient_mlp.py b/op_builder/fused_weight_gradient_mlp.py new file mode 100644 index 000000000..b6d595385 --- /dev/null +++ b/op_builder/fused_weight_gradient_mlp.py @@ -0,0 +1,42 @@ +from .builder import CUDAOpBuilder + +class FusedWeightGradientMlpCudaBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_FUSED_WEIGHT_GRADIENT_MLP' + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" + NAME = "fused_weight_gradient_mlp_cuda" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return [ + "csrc/megatron/fused_weight_gradient_dense.cpp", + "csrc/megatron/fused_weight_gradient_dense_cuda.cu", + "csrc/megatron/fused_weight_gradient_dense_16bit_prec_cuda.cu", + ] + + def include_paths(self): + # Both csrc and csrc/megatron are included in the original extension + return ['csrc', 'csrc/megatron'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + nvcc_flags = [ + '-O3', + '-U__CUDA_NO_HALF_OPERATORS__', + '-U__CUDA_NO_HALF_CONVERSIONS__' + ] + self.version_dependent_macros() + if not self.is_rocm_pytorch(): + nvcc_flags.extend( + [ + '--expt-relaxed-constexpr', + '--expt-extended-lambda', + "--use_fast_math" + ]) + self.compute_capability_args() + return nvcc_flags \ No newline at end of file diff --git a/op_builder/generic_scaled_masked_softmax_cuda.py b/op_builder/generic_scaled_masked_softmax_cuda.py new file mode 
100644 index 000000000..a0fb2d5fc --- /dev/null +++ b/op_builder/generic_scaled_masked_softmax_cuda.py @@ -0,0 +1,39 @@ +from .builder import CUDAOpBuilder + +class GenericScaledMaskedSoftmaxCudaBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_GENERIC_SCALED_MASKED_SOFTMAX_CUDA' + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" + NAME = "generic_scaled_masked_softmax_cuda" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return [ + "csrc/megatron/generic_scaled_masked_softmax_cpu.cpp", + "csrc/megatron/generic_scaled_masked_softmax_cuda.cu" + ] + + def include_paths(self): + return ['csrc'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + nvcc_flags = [ + '-O3', + '-U__CUDA_NO_HALF_OPERATORS__', + '-U__CUDA_NO_HALF_CONVERSIONS__' + ] + self.version_dependent_macros() + if not self.is_rocm_pytorch(): + nvcc_flags.extend( + [ + '--expt-relaxed-constexpr', + '--expt-extended-lambda' + ]) + return nvcc_flags \ No newline at end of file diff --git a/op_builder/mlp.py b/op_builder/mlp.py new file mode 100644 index 000000000..c6a177721 --- /dev/null +++ b/op_builder/mlp.py @@ -0,0 +1,32 @@ +from .builder import CUDAOpBuilder + +import sys + + +class MlpBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_MLP' + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" + NAME = "mlp_cuda" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ['csrc/mlp.cpp', + 'csrc/mlp_cuda.cu'] + + def include_paths(self): + return ['csrc'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + nvcc_flags = ['-O3'] + self.version_dependent_macros() + if self.is_rocm_pytorch(): + nvcc_flags.extend(self.backward_pass_guard_args()) + return nvcc_flags \ No newline at end of file diff --git a/op_builder/nccl_allocator.py b/op_builder/nccl_allocator.py new file mode 100644 index 000000000..320e76476 --- /dev/null +++ b/op_builder/nccl_allocator.py @@ -0,0 +1,36 @@ +from .builder import CUDAOpBuilder + +import sys + + +class NCCLAllocatorBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_NCCL_ALLOCATOR' + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" + NAME = "_apex_nccl_allocator" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ["contrib/csrc/nccl_allocator/NCCLAllocator.cpp"] + + def include_paths(self): + return ['contrib/csrc/'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + self.generator_args() + + def nvcc_args(self): + return self.nccl_args() + + def is_compatible(self, verbose=False): + torch_version = self.torch_version() + if torch_version >= (2, 6): + available_nccl_version = self.nccl_version() + if available_nccl_version >= (2, 19): + return True + return False \ No newline at end of file diff --git a/op_builder/nccl_p2p.py b/op_builder/nccl_p2p.py new file mode 100644 index 000000000..37772572e --- /dev/null +++ b/op_builder/nccl_p2p.py @@ -0,0 +1,26 @@ +from .builder import CUDAOpBuilder + +import sys + + +class NCCLP2PBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_NCCL_P2P' + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" + NAME = "nccl_p2p_cuda" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + 
def sources(self): + return ["contrib/csrc/nccl_p2p/nccl_p2p_cuda.cu", + "contrib/csrc/nccl_p2p/nccl_p2p.cpp"] + + def include_paths(self): + return ['contrib/csrc/'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + self.generator_args() \ No newline at end of file diff --git a/op_builder/peer_memory.py b/op_builder/peer_memory.py new file mode 100644 index 000000000..c869f0be6 --- /dev/null +++ b/op_builder/peer_memory.py @@ -0,0 +1,26 @@ +from .builder import CUDAOpBuilder + +import sys + + +class PeerMemoryBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_PEER_MEMORY' + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" + NAME = "peer_memory_cuda" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ["contrib/csrc/peer_memory/peer_memory_cuda.cu", + "contrib/csrc/peer_memory/peer_memory.cpp"] + + def include_paths(self): + return ['contrib/csrc/'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + self.generator_args() \ No newline at end of file diff --git a/op_builder/scaled_masked_softmax_cuda.py b/op_builder/scaled_masked_softmax_cuda.py new file mode 100644 index 000000000..1013ef8d2 --- /dev/null +++ b/op_builder/scaled_masked_softmax_cuda.py @@ -0,0 +1,40 @@ +from .builder import CUDAOpBuilder + +class ScaledMaskedSoftmaxCudaBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_SCALED_MASKED_SOFTMAX_CUDA' + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" + NAME = "scaled_masked_softmax_cuda" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return [ + "csrc/megatron/scaled_masked_softmax_cpu.cpp", + "csrc/megatron/scaled_masked_softmax_cuda.cu" + ] + + def include_paths(self): + # Both csrc and csrc/megatron are included in the original extension + return ['csrc', 'csrc/megatron'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + nvcc_flags = [ + '-O3', + '-U__CUDA_NO_HALF_OPERATORS__', + '-U__CUDA_NO_HALF_CONVERSIONS__' + ] + self.version_dependent_macros() + if not self.is_rocm_pytorch(): + nvcc_flags.extend( + [ + '--expt-relaxed-constexpr', + '--expt-extended-lambda' + ]) + return nvcc_flags \ No newline at end of file diff --git a/op_builder/scaled_softmax_cuda.py b/op_builder/scaled_softmax_cuda.py new file mode 100644 index 000000000..f29543963 --- /dev/null +++ b/op_builder/scaled_softmax_cuda.py @@ -0,0 +1,41 @@ +from .builder import CUDAOpBuilder + +import sys + +class ScaledSoftmaxCudaBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_SCALED_SOFTMAX_CUDA' + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" + NAME = "scaled_softmax_cuda" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return [ + "csrc/megatron/scaled_softmax_cpu.cpp", + "csrc/megatron/scaled_softmax_cuda.cu" + ] + + def include_paths(self): + return ['csrc'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + nvcc_flags = [ + '-O3', + '-U__CUDA_NO_HALF_OPERATORS__', + '-U__CUDA_NO_HALF_CONVERSIONS__' + ] + self.version_dependent_macros() + if not self.is_rocm_pytorch(): + nvcc_flags.extend( + [ + '--expt-relaxed-constexpr', + '--expt-extended-lambda' + ]) + return nvcc_flags \ No newline at end of file diff --git 
a/op_builder/scaled_upper_triang_masked_softmax_cuda.py b/op_builder/scaled_upper_triang_masked_softmax_cuda.py new file mode 100644 index 000000000..3c2273ad9 --- /dev/null +++ b/op_builder/scaled_upper_triang_masked_softmax_cuda.py @@ -0,0 +1,39 @@ +from .builder import CUDAOpBuilder + +class ScaledUpperTriangMaskedSoftmaxCudaBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_SCALED_UPPER_TRIANG_MASKED_SOFTMAX_CUDA' + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" + NAME = "scaled_upper_triang_masked_softmax_cuda" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return [ + "csrc/megatron/scaled_upper_triang_masked_softmax_cpu.cpp", + "csrc/megatron/scaled_upper_triang_masked_softmax_cuda.cu" + ] + + def include_paths(self): + return ['csrc'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + nvcc_flags = [ + '-O3', + '-U__CUDA_NO_HALF_OPERATORS__', + '-U__CUDA_NO_HALF_CONVERSIONS__' + ] + self.version_dependent_macros() + if not self.is_rocm_pytorch(): + nvcc_flags.extend( + [ + '--expt-relaxed-constexpr', + '--expt-extended-lambda' + ]) + return nvcc_flags \ No newline at end of file diff --git a/op_builder/syncbn.py b/op_builder/syncbn.py new file mode 100644 index 000000000..251c33e01 --- /dev/null +++ b/op_builder/syncbn.py @@ -0,0 +1,28 @@ +from .builder import CUDAOpBuilder + +import sys + + +class SyncBnBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_SYNCBN' + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" + NAME = "syncbn" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ['csrc/syncbn.cpp', 'csrc/welford.cu'] + + def include_paths(self): + return ['csrc'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + return ['-O3'] + self.version_dependent_macros() \ No newline at end of file diff --git a/op_builder/transducer_joint.py b/op_builder/transducer_joint.py new file mode 100644 index 000000000..c17f60f7b --- /dev/null +++ b/op_builder/transducer_joint.py @@ -0,0 +1,33 @@ +from .builder import CUDAOpBuilder +import sys + + +class TransducerJointBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_TRANSDUCER_JOINT' + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" + NAME = "transducer_joint_cuda" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ["contrib/csrc/transducer/transducer_joint.cpp", + "contrib/csrc/transducer/transducer_joint_kernel.cu"] + + def include_paths(self): + return ['contrib/csrc/', + #it uses philox.cuh from contrib/csrc/multihead_attn + 'contrib/csrc/multihead_attn'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + self.generator_args() + + def nvcc_args(self): + nvcc_flags = ['-O3'] + self.version_dependent_macros() + self.generator_args() + if not self.is_rocm_pytorch(): + nvcc_flags += self.nvcc_threads_args() + return nvcc_flags \ No newline at end of file diff --git a/op_builder/transducer_loss.py b/op_builder/transducer_loss.py new file mode 100644 index 000000000..53ae4eaac --- /dev/null +++ b/op_builder/transducer_loss.py @@ -0,0 +1,31 @@ +from .builder import CUDAOpBuilder +import sys + + +class TransducerLossBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_TRANSDUCER_LOSS' + INCLUDE_FLAG = 
"APEX_BUILD_CUDA_OPS" + NAME = "transducer_loss_cuda" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ["contrib/csrc/transducer/transducer_loss.cpp", + "contrib/csrc/transducer/transducer_loss_kernel.cu"] + + def include_paths(self): + return ['contrib/csrc/' ] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + nvcc_flags = ['-O3'] + self.version_dependent_macros() + if not self.is_rocm_pytorch(): + nvcc_flags += self.nvcc_threads_args() + return nvcc_flags \ No newline at end of file diff --git a/op_builder/xentropy.py b/op_builder/xentropy.py new file mode 100644 index 000000000..84f3ddf12 --- /dev/null +++ b/op_builder/xentropy.py @@ -0,0 +1,29 @@ +from .builder import CUDAOpBuilder + +import sys + + +class XentropyBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_XENTROPY' + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" + NAME = "xentropy_cuda" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ['contrib/csrc/xentropy/interface.cpp', + 'contrib/csrc/xentropy/xentropy_kernel.cu'] + + def include_paths(self): + return ['csrc', 'contrib/csrc/' ] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + return ['-O3'] + self.version_dependent_macros() \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 241f90a94..d527b4498 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,4 +5,8 @@ PyYAML>=5.1 pytest>=3.5.1 packaging>=14.0 matplotlib>=3.8 -pandas>=2.2.2 \ No newline at end of file +pandas>=2.2.2 +py-cpuinfo +build +ninja +wheel \ No newline at end of file diff --git a/scripts/clean.py b/scripts/clean.py new file mode 100644 index 000000000..be7e69798 --- /dev/null +++ b/scripts/clean.py @@ -0,0 +1,16 @@ +import torch.utils.cpp_extension +import shutil +import os +import sys + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) +from op_builder.all_ops import ALL_OPS + +torch_ext_directory = torch.utils.cpp_extension._get_build_directory("", False) + +install_ops = dict.fromkeys(ALL_OPS.keys(), False) +for op_name, builder in ALL_OPS.items(): + path = os.path.join(torch_ext_directory, op_name) + if os.path.exists(path): + print ("removing torch extension", op_name, "at", torch_ext_directory) + shutil.rmtree(path) \ No newline at end of file diff --git a/setup.py b/setup.py index c4044a0a3..816849e7c 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import glob from packaging.version import parse, Version -from setuptools import setup, find_packages +from setuptools import setup, find_packages, Distribution import subprocess import torch @@ -17,44 +17,17 @@ load, ) +import typing +import shlex + +sys.path.insert(0, os.path.abspath(os.path.dirname(__file__))) + +from op_builder.all_ops import ALL_OPS +import shutil # ninja build does not work unless include_dirs are abs path this_dir = os.path.dirname(os.path.abspath(__file__)) -torch_dir = torch.__path__[0] - - -# https://github.com/pytorch/pytorch/pull/71881 -# For the extensions which have rocblas_gemm_flags_fp16_alt_impl we need to make sure if at::BackwardPassGuard exists. -# It helps the extensions be backward compatible with old PyTorch versions. 
-# The check and ROCM_BACKWARD_PASS_GUARD in nvcc/hipcc args can be retired once the PR is merged into PyTorch upstream. - -context_file = os.path.join(torch_dir, "include", "ATen", "Context.h") -if os.path.exists(context_file): - lines = open(context_file, 'r').readlines() - found_Backward_Pass_Guard = False - found_ROCmBackward_Pass_Guard = False - for line in lines: - if "BackwardPassGuard" in line: - # BackwardPassGuard has been renamed to ROCmBackwardPassGuard - # https://github.com/pytorch/pytorch/pull/71881/commits/4b82f5a67a35406ffb5691c69e6b4c9086316a43 - if "ROCmBackwardPassGuard" in line: - found_ROCmBackward_Pass_Guard = True - else: - found_Backward_Pass_Guard = True - break - -found_aten_atomic_header = False -if os.path.exists(os.path.join(torch_dir, "include", "ATen", "Atomic.cuh")): - found_aten_atomic_header = True - -def raise_if_cuda_home_none(global_option: str) -> None: - if CUDA_HOME is not None or ROCM_HOME is not None: - return - raise RuntimeError( - f"{global_option} was requested, but nvcc was not found. Are you sure your environment has nvcc available? " - "If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, " - "only images whose names contain 'devel' will provide nvcc." - ) + def get_cuda_bare_metal_version(cuda_dir): raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True) @@ -74,50 +47,6 @@ def get_rocm_bare_metal_version(rocm_dir): bare_metal_minor = release[1][0] return raw_output, bare_metal_major, bare_metal_minor -def check_cuda_torch_binary_vs_bare_metal(cuda_dir): - raw_output, bare_metal_major, bare_metal_minor = get_cuda_bare_metal_version(cuda_dir) - torch_binary_major = torch.version.cuda.split(".")[0] - torch_binary_minor = torch.version.cuda.split(".")[1] - - print("\nCompiling cuda extensions with") - print(raw_output + "from " + cuda_dir + "/bin\n") - - if (bare_metal_major != torch_binary_major) or (bare_metal_minor != torch_binary_minor): - raise RuntimeError( - "Cuda extensions are being compiled with a version of Cuda that does " - "not match the version used to compile Pytorch binaries. " - "Pytorch binaries were compiled with Cuda {}.\n".format(torch.version.cuda) - + "In some cases, a minor-version mismatch will not cause later errors: " - "https://github.com/NVIDIA/apex/pull/323#discussion_r287021798. " - "You can try commenting out this check (at your own risk)." - ) - -def check_rocm_torch_binary_vs_bare_metal(rocm_dir): - raw_output, bare_metal_major, bare_metal_minor = get_rocm_bare_metal_version(rocm_dir) - torch_binary_major = torch.version.hip.split(".")[0] - torch_binary_minor = torch.version.hip.split(".")[1] - - print("\nCompiling rocm extensions with") - print(raw_output + "from " + rocm_dir + "/bin\n") - - if (bare_metal_major != torch_binary_major) or (bare_metal_minor != torch_binary_minor): - raise RuntimeError( - "Cuda extensions are being compiled with a version of Cuda that does " - "not match the version used to compile Pytorch binaries. " - "Pytorch binaries were compiled with Cuda {}.\n".format(torch.version.cuda) - + "In some cases, a minor-version mismatch will not cause later errors: " - "https://github.com/NVIDIA/apex/pull/323#discussion_r287021798. " - "You can try commenting out this check (at your own risk)." - ) - -def raise_if_home_none(global_option: str) -> None: - if CUDA_HOME is not None or ROCM_HOME is not None: - return - raise RuntimeError( - f"{global_option} was requested, but nvcc was not found. 
Are you sure your environment has nvcc available? " - "If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, " - "only images whose names contain 'devel' will provide nvcc." - ) def get_apex_version(): cwd = os.path.dirname(os.path.abspath(__file__)) @@ -135,23 +64,6 @@ def get_apex_version(): apex_version += ".git"+os.getenv("APEX_COMMIT")[:8] return apex_version -def append_nvcc_threads(nvcc_extra_args): - _, bare_metal_major, bare_metal_minor = get_cuda_bare_metal_version(CUDA_HOME) - if int(bare_metal_major) >= 11 and int(bare_metal_minor) >= 2: - return nvcc_extra_args + ["--threads", "4"] - return nvcc_extra_args - - -def check_cudnn_version_and_warn(global_option: str, required_cudnn_version: int) -> bool: - cudnn_available = torch.backends.cudnn.is_available() - cudnn_version = torch.backends.cudnn.version() if cudnn_available else None - if not (cudnn_available and (cudnn_version >= required_cudnn_version)): - warnings.warn( - f"Skip `{global_option}` as it requires cuDNN {required_cudnn_version} or later, " - f"but {'cuDNN is not available' if not cudnn_available else cudnn_version}" - ) - return False - return True print("\n\ntorch.__version__ = {}\n\n".format(torch.__version__)) TORCH_MAJOR = int(torch.__version__.split('.')[0]) @@ -202,779 +114,154 @@ def check_if_rocm_pytorch(): ) # cmdclass = {} -ext_modules = [] - extras = {} -# Set up macros for forward/backward compatibility hack around -# https://github.com/pytorch/pytorch/commit/4404762d7dd955383acee92e6f06b48144a0742e -# and -# https://github.com/NVIDIA/apex/issues/456 -# https://github.com/pytorch/pytorch/commit/eb7b39e02f7d75c26d8a795ea8c7fd911334da7e#diff-4632522f237f1e4e728cb824300403ac -version_ge_1_1 = [] -if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 0): - version_ge_1_1 = ["-DVERSION_GE_1_1"] -version_ge_1_3 = [] -if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 2): - version_ge_1_3 = ["-DVERSION_GE_1_3"] -version_ge_1_5 = [] -if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 4): - version_ge_1_5 = ["-DVERSION_GE_1_5"] -version_dependent_macros = version_ge_1_1 + version_ge_1_3 + version_ge_1_5 - if not IS_ROCM_PYTORCH: _, bare_metal_version = get_cuda_bare_metal_version(CUDA_HOME) else: _, bare_metal_version, bare_metal_minor = get_rocm_bare_metal_version(ROCM_HOME) -if IS_ROCM_PYTORCH and (ROCM_MAJOR >= 6): - version_dependent_macros += ["-DHIPBLAS_V2"] +# ***************************** Op builder ********************** + +def get_env_if_set(key, default: typing.Any = ""): + """ + Returns an environment variable if it is set and not "", + otherwise returns a default value. In contrast, the fallback + parameter of os.environ.get() is skipped if the variable is set to "". 
+ """ + return os.environ.get(key, None) or default -if "--cpp_ext" in sys.argv or "--cuda_ext" in sys.argv: +def command_exists(cmd): + if sys.platform == "win32": + safe_cmd = shlex.split(f'{cmd}') + result = subprocess.Popen(safe_cmd, stdout=subprocess.PIPE) + return result.wait() == 1 + else: + safe_cmd = shlex.split(f"bash -c type {cmd}") + result = subprocess.Popen(safe_cmd, stdout=subprocess.PIPE) + return result.wait() == 0 + +BUILD_OP_DEFAULT = 0 +BUILD_CPP_OPS = int(get_env_if_set('APEX_BUILD_CPP_OPS', BUILD_OP_DEFAULT)) +BUILD_CUDA_OPS = int(get_env_if_set('APEX_BUILD_CUDA_OPS', BUILD_OP_DEFAULT)) +build_flags = { + "APEX_BUILD_CPP_OPS" : BUILD_CPP_OPS, + "APEX_BUILD_CUDA_OPS" : BUILD_CUDA_OPS, + } + +if BUILD_CPP_OPS or BUILD_CUDA_OPS: if TORCH_MAJOR == 0: raise RuntimeError("--cpp_ext requires Pytorch 1.0 or later, " "found torch.__version__ = {}".format(torch.__version__) ) -if "--cpp_ext" in sys.argv: - sys.argv.remove("--cpp_ext") - ext_modules.append(CppExtension("apex_C", ["csrc/flatten_unflatten.cpp"])) - -if "--distributed_adam" in sys.argv or "--cuda_ext" in sys.argv: - if "--distributed_adam" in sys.argv: - sys.argv.remove("--distributed_adam") - - raise_if_home_none("--distributed_adam") - nvcc_args_adam = ['-O3', '--use_fast_math'] + version_dependent_macros - hipcc_args_adam = ['-O3'] + version_dependent_macros - ext_modules.append( - CUDAExtension( - name='distributed_adam_cuda', - sources=[ - 'apex/contrib/csrc/optimizers/multi_tensor_distopt_adam.cpp', - 'apex/contrib/csrc/optimizers/multi_tensor_distopt_adam_kernel.cu', - ], - include_dirs=[ - os.path.join(this_dir, 'csrc'), - os.path.join(this_dir, 'apex/contrib/csrc/optimizers'), - ], - extra_compile_args={ - 'cxx': ['-O3',] + version_dependent_macros, - 'nvcc':nvcc_args_adam if not IS_ROCM_PYTORCH else hipcc_args_adam, - } - ) - ) - -if "--distributed_lamb" in sys.argv or "--cuda_ext" in sys.argv: - if "--distributed_lamb" in sys.argv: - sys.argv.remove("--distributed_lamb") - - raise_if_home_none("--distributed_lamb") - - print ("INFO: Building the distributed_lamb extension.") - nvcc_args_distributed_lamb = ['-O3', '--use_fast_math'] + version_dependent_macros - hipcc_args_distributed_lamb = ['-O3'] + version_dependent_macros - ext_modules.append( - CUDAExtension( - name='distributed_lamb_cuda', - sources=[ - 'apex/contrib/csrc/optimizers/multi_tensor_distopt_lamb.cpp', - 'apex/contrib/csrc/optimizers/multi_tensor_distopt_lamb_kernel.cu', - ], - include_dirs=[os.path.join(this_dir, 'csrc')], - extra_compile_args={ - 'cxx': ['-O3',] + version_dependent_macros, - 'nvcc': nvcc_args_distributed_lamb if not IS_ROCM_PYTORCH else hipcc_args_distributed_lamb, - } - ) - ) - - -if "--cuda_ext" in sys.argv: - raise_if_home_none("--cuda_ext") - - if not IS_ROCM_PYTORCH: - check_cuda_torch_binary_vs_bare_metal(CUDA_HOME) - else: - check_rocm_torch_binary_vs_bare_metal(ROCM_HOME) - -#********** multi-tensor apply **************** - print ("INFO: Building the multi-tensor apply extension.") - nvcc_args_multi_tensor = ['-lineinfo', '-O3', '--use_fast_math'] + version_dependent_macros - hipcc_args_multi_tensor = ['-O3'] + version_dependent_macros - ext_modules.append( - CUDAExtension( - name='amp_C', - sources=[ - 'csrc/amp_C_frontend.cpp', - 'csrc/multi_tensor_sgd_kernel.cu', - 'csrc/multi_tensor_scale_kernel.cu', - 'csrc/multi_tensor_axpby_kernel.cu', - 'csrc/multi_tensor_l2norm_kernel.cu', - 'csrc/multi_tensor_l2norm_kernel_mp.cu', - 'csrc/multi_tensor_l2norm_scale_kernel.cu', - 'csrc/multi_tensor_lamb_stage_1.cu', - 
'csrc/multi_tensor_lamb_stage_2.cu', - 'csrc/multi_tensor_adam.cu', - 'csrc/multi_tensor_adagrad.cu', - 'csrc/multi_tensor_novograd.cu', - 'csrc/multi_tensor_lars.cu', - 'csrc/multi_tensor_lamb.cu', - 'csrc/multi_tensor_lamb_mp.cu'], - include_dirs=[os.path.join(this_dir, 'csrc')], - extra_compile_args={'cxx': ['-O3'] + version_dependent_macros, - 'nvcc': nvcc_args_multi_tensor if not IS_ROCM_PYTORCH else hipcc_args_multi_tensor, - } - ) - ) - -#********** syncbn **************** - print("INFO: Building syncbn extension.") - ext_modules.append( - CUDAExtension( - name='syncbn', - sources=[ - 'csrc/syncbn.cpp', - 'csrc/welford.cu', - ], - include_dirs=[os.path.join(this_dir, 'csrc')], - extra_compile_args={ - 'cxx': ['-O3'] + version_dependent_macros, - 'nvcc':['-O3'] + version_dependent_macros, - } - ) - ) - -#********** fused layernorm **************** - nvcc_args_layer_norm = ['-maxrregcount=50', '-O3', '--use_fast_math'] + version_dependent_macros - hipcc_args_layer_norm = ['-O3'] + version_dependent_macros - - print ("INFO: Building fused layernorm extension.") - ext_modules.append( - CUDAExtension( - name='fused_layer_norm_cuda', - sources=[ - 'csrc/layer_norm_cuda.cpp', - 'csrc/layer_norm_cuda_kernel.cu', - ], - include_dirs=[os.path.join(this_dir, 'csrc')], - extra_compile_args={ - 'cxx': ['-O3'] + version_dependent_macros, - 'nvcc': nvcc_args_layer_norm if not IS_ROCM_PYTORCH else hipcc_args_layer_norm, - } - ) - ) - -#********** fused dense **************** - ext_modules.append( - CUDAExtension( - name='fused_dense_cuda', - sources=[ - 'csrc/fused_dense_base.cpp', - 'csrc/fused_dense_cuda.cu', - ], - extra_compile_args={ - 'cxx': ['-O3'] + version_dependent_macros, - 'nvcc':['-O3'] + version_dependent_macros - } - ) - ) - - bare_metal_version = Version(bare_metal_version) - print("Bare Metal Version : ", bare_metal_version) - if True: - - cc_flag = [] - cc_flag.append("-gencode") - cc_flag.append("arch=compute_70,code=sm_70") - cc_flag.append("-gencode") - cc_flag.append("arch=compute_80,code=sm_80") - if bare_metal_version >= Version("11.1"): - cc_flag.append("-gencode") - cc_flag.append("arch=compute_86,code=sm_86") - if bare_metal_version >= Version("11.8"): - cc_flag.append("-gencode") - cc_flag.append("arch=compute_90,code=sm_90") - - nvcc_args_fused_weight_gradient = [ - "-O3", - "-U__CUDA_NO_HALF_OPERATORS__", - "-U__CUDA_NO_HALF_CONVERSIONS__", - "--expt-relaxed-constexpr", - "--expt-extended-lambda", - "--use_fast_math", - ] + version_dependent_macros + cc_flag - - hipcc_args_fused_weight_gradient = [ - "-O3", - "-U__CUDA_NO_HALF_OPERATORS__", - "-U__CUDA_NO_HALF_CONVERSIONS__" - ] + version_dependent_macros - - ext_modules.append( - CUDAExtension( - name="fused_weight_gradient_mlp_cuda", - include_dirs=[os.path.join(this_dir, "csrc")], - sources=[ - "csrc/megatron/fused_weight_gradient_dense.cpp", - "csrc/megatron/fused_weight_gradient_dense_cuda.cu", - "csrc/megatron/fused_weight_gradient_dense_16bit_prec_cuda.cu", - ], - extra_compile_args={ - "cxx": ["-O3"] + version_dependent_macros, - "nvcc": nvcc_args_fused_weight_gradient if not IS_ROCM_PYTORCH else hipcc_args_fused_weight_gradient, - }, - ) - ) -#********** mlp_cuda **************** - hipcc_args_mlp = ['-O3'] + version_dependent_macros - if found_Backward_Pass_Guard: - hipcc_args_mlp = hipcc_args_mlp + ['-DBACKWARD_PASS_GUARD'] + ['-DBACKWARD_PASS_GUARD_CLASS=BackwardPassGuard'] - if found_ROCmBackward_Pass_Guard: - hipcc_args_mlp = hipcc_args_mlp + ['-DBACKWARD_PASS_GUARD'] + 
['-DBACKWARD_PASS_GUARD_CLASS=ROCmBackwardPassGuard'] - - print ("INFO: Building the MLP Extension.") - ext_modules.append( - CUDAExtension( - name='mlp_cuda', - sources=[ - 'csrc/mlp.cpp', - 'csrc/mlp_cuda.cu', - ], - include_dirs=[os.path.join(this_dir, 'csrc')], - extra_compile_args={ - 'cxx': ['-O3'] + version_dependent_macros, - 'nvcc':['-O3'] + version_dependent_macros if not IS_ROCM_PYTORCH else hipcc_args_mlp, - } - ) - ) - -#********** scaled_upper_triang_masked_softmax_cuda **************** - nvcc_args_transformer = ['-O3', - '-U__CUDA_NO_HALF_OPERATORS__', - '-U__CUDA_NO_HALF_CONVERSIONS__', - '--expt-relaxed-constexpr', - '--expt-extended-lambda'] + version_dependent_macros - hipcc_args_transformer = ['-O3', - '-U__CUDA_NO_HALF_OPERATORS__', - '-U__CUDA_NO_HALF_CONVERSIONS__'] + version_dependent_macros - - ext_modules.append( - CUDAExtension( - name='scaled_upper_triang_masked_softmax_cuda', - sources=[ - 'csrc/megatron/scaled_upper_triang_masked_softmax_cpu.cpp', - 'csrc/megatron/scaled_upper_triang_masked_softmax_cuda.cu', - ], - include_dirs=[os.path.join(this_dir, 'csrc')], - extra_compile_args={ - 'cxx': ['-O3'] + version_dependent_macros, - 'nvcc':nvcc_args_transformer if not IS_ROCM_PYTORCH else hipcc_args_transformer, - } - ) - ) -#*********** generic_scaled_masked_softmax_cuda **************** - ext_modules.append( - CUDAExtension( - name="generic_scaled_masked_softmax_cuda", - sources=[ - "csrc/megatron/generic_scaled_masked_softmax_cpu.cpp", - "csrc/megatron/generic_scaled_masked_softmax_cuda.cu", - ], - include_dirs=[os.path.join(this_dir, "csrc")], - extra_compile_args={ - "cxx": ["-O3"] + version_dependent_macros, - "nvcc": nvcc_args_transformer if not IS_ROCM_PYTORCH else hipcc_args_transformer, - }, - ) - ) - - -#*********** scaled_masked_softmax_cuda **************** - ext_modules.append( - CUDAExtension( - name='scaled_masked_softmax_cuda', - sources=[ - 'csrc/megatron/scaled_masked_softmax_cpu.cpp', - 'csrc/megatron/scaled_masked_softmax_cuda.cu', - ], - include_dirs=[os.path.join(this_dir, 'csrc'), - os.path.join(this_dir, 'csrc/megatron')], - extra_compile_args={ - 'cxx': ['-O3'] + version_dependent_macros, - 'nvcc':nvcc_args_transformer if not IS_ROCM_PYTORCH else hipcc_args_transformer, - } - ) - ) - -#*********** scaled_softmax_cuda **************** - ext_modules.append( - CUDAExtension( - name="scaled_softmax_cuda", - sources=[ - "csrc/megatron/scaled_softmax_cpu.cpp", - "csrc/megatron/scaled_softmax_cuda.cu", - ], - include_dirs=[os.path.join(this_dir, "csrc")], - extra_compile_args={ - "cxx": ["-O3"] + version_dependent_macros, - "nvcc":nvcc_args_transformer if not IS_ROCM_PYTORCH else hipcc_args_transformer, - } - ) - ) - -#*********** fused_rotary_positional_embedding **************** - if IS_ROCM_PYTORCH and "--aiter" in sys.argv: - sys.argv.remove("--aiter") - subprocess.run(["pip", "install", "."], cwd = "third_party/aiter") - - ext_modules.append( - CUDAExtension( - name="fused_rotary_positional_embedding", - sources=[ - "csrc/megatron/fused_rotary_positional_embedding.cpp", - "csrc/megatron/fused_rotary_positional_embedding_cuda.cu", - ], - include_dirs=[os.path.join(this_dir, "csrc")], - extra_compile_args={ - "cxx": ["-O3"] + version_dependent_macros, - "nvcc":nvcc_args_transformer if not IS_ROCM_PYTORCH else hipcc_args_transformer, - } - ) - ) - -#*********** fused_bias_swiglu **************** - nvcc_args_swiglu = ['-O3', - '-U__CUDA_NO_HALF_OPERATORS__', - '-U__CUDA_NO_HALF_CONVERSIONS__', - '--expt-relaxed-constexpr', - 
'--expt-extended-lambda'] + version_dependent_macros - hipcc_args_swiglu = ['-O3', - '-U__CUDA_NO_HALF_OPERATORS__', - '-U__CUDA_NO_HALF_CONVERSIONS__'] + version_dependent_macros - - if IS_ROCM_PYTORCH: - try: - amdgpu_targets = os.environ.get('PYTORCH_ROCM_ARCH', '') - if not amdgpu_targets: - print("Warning: PYTORCH_ROCM_ARCH environment variable is empty.") - print("Using default architecture. Set this variable for specific GPU targets.") - print("Example: export PYTORCH_ROCM_ARCH=gfx906") - amdgpu_targets = "gfx906" # Default to a common architecture - - # Handle multiple architectures (separated by semicolons) - for amdgpu_target in amdgpu_targets.split(';'): - if amdgpu_target: # Skip empty strings - hipcc_args_swiglu += [f'--offload-arch={amdgpu_target}'] - except Exception as e: - print(f"Warning: Error processing PYTORCH_ROCM_ARCH: {e}") - print("Falling back to default architecture gfx906") - hipcc_args_swiglu += ['--offload-arch=gfx906'] - - - ext_modules.append( - CUDAExtension( - name="fused_bias_swiglu", - sources=[ - "csrc/megatron/fused_bias_swiglu.cpp", - "csrc/megatron/fused_bias_swiglu_cuda.cu", - ], - include_dirs=[os.path.join(this_dir, "csrc")], - extra_compile_args={ - "cxx": ["-O3"] + version_dependent_macros, - "nvcc": nvcc_args_swiglu if not IS_ROCM_PYTORCH else hipcc_args_swiglu, - } - ) - ) - -if "--bnp" in sys.argv or "--cuda_ext" in sys.argv: - - if "--bnp" in sys.argv: - sys.argv.remove("--bnp") +def is_env_set(key): + """ + Checks if an environment variable is set and not "". + """ + return bool(os.environ.get(key, None)) - if torch.utils.cpp_extension.CUDA_HOME is None and not IS_ROCM_PYTORCH: - raise RuntimeError("--bnp was requested, but nvcc was not found. Are you sure your environment has nvcc available? If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.") - else: - ext_modules.append( - CUDAExtension(name='bnp', - sources=['apex/contrib/csrc/groupbn/batch_norm.cu', - 'apex/contrib/csrc/groupbn/ipc.cu', - 'apex/contrib/csrc/groupbn/interface.cpp', - 'apex/contrib/csrc/groupbn/batch_norm_add_relu.cu'], - include_dirs=[os.path.join(this_dir, 'csrc'), - os.path.join(this_dir, 'apex/contrib/csrc/groupbn')], - extra_compile_args={'cxx': [] + version_dependent_macros, - 'nvcc':['-DCUDA_HAS_FP16=1', - '-D__CUDA_NO_HALF_OPERATORS__', - '-D__CUDA_NO_HALF_CONVERSIONS__', - '-D__CUDA_NO_HALF2_OPERATORS__'] + version_dependent_macros})) - -if "--xentropy" in sys.argv or "--cuda_ext" in sys.argv: - if "--xentropy" in sys.argv: - sys.argv.remove("--xentropy") - - if torch.utils.cpp_extension.CUDA_HOME is None and not IS_ROCM_PYTORCH: - raise RuntimeError("--xentropy was requested, but nvcc was not found. Are you sure your environment has nvcc available? 
If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.") - else: - print ("INFO: Building the xentropy extension.") - ext_modules.append( - CUDAExtension(name='xentropy_cuda', - sources=['apex/contrib/csrc/xentropy/interface.cpp', - 'apex/contrib/csrc/xentropy/xentropy_kernel.cu'], - include_dirs=[os.path.join(this_dir, 'csrc'), - os.path.join(this_dir, 'apex/contrib/csrc/xentropy')], - extra_compile_args={'cxx': ['-O3'] + version_dependent_macros, - 'nvcc':['-O3'] + version_dependent_macros})) - -if "--focal_loss" in sys.argv or "--cuda_ext" in sys.argv: - if "--focal_loss" in sys.argv: - sys.argv.remove("--focal_loss") - ext_modules.append( - CUDAExtension( - name='focal_loss_cuda', - sources=[ - 'apex/contrib/csrc/focal_loss/focal_loss_cuda.cpp', - 'apex/contrib/csrc/focal_loss/focal_loss_cuda_kernel.cu', - ], - include_dirs=[os.path.join(this_dir, 'csrc')], - extra_compile_args={ - 'cxx': ['-O3'] + version_dependent_macros, - 'nvcc':(['-O3', '--use_fast_math', '--ftz=false'] if not IS_ROCM_PYTORCH else ['-O3']) + version_dependent_macros, - }, - ) - ) - -if "--index_mul_2d" in sys.argv or "--cuda_ext" in sys.argv: - if "--index_mul_2d" in sys.argv: - sys.argv.remove("--index_mul_2d") - - args_index_mul_2d = ['-O3'] - if not IS_ROCM_PYTORCH: - args_index_mul_2d += ['--use_fast_math', '--ftz=false'] - if found_aten_atomic_header: - args_index_mul_2d += ['-DATEN_ATOMIC_HEADER'] - - ext_modules.append( - CUDAExtension( - name='fused_index_mul_2d', - sources=[ - 'apex/contrib/csrc/index_mul_2d/index_mul_2d_cuda.cpp', - 'apex/contrib/csrc/index_mul_2d/index_mul_2d_cuda_kernel.cu', - ], - include_dirs=[os.path.join(this_dir, 'csrc')], - extra_compile_args={ - 'cxx': ['-O3'] + version_dependent_macros, - 'nvcc': args_index_mul_2d + version_dependent_macros, - }, - ) - ) - -if "--deprecated_fused_adam" in sys.argv or "--cuda_ext" in sys.argv: - if "--deprecated_fused_adam" in sys.argv: - sys.argv.remove("--deprecated_fused_adam") - - if torch.utils.cpp_extension.CUDA_HOME is None and not IS_ROCM_PYTORCH: - raise RuntimeError("--deprecated_fused_adam was requested, but nvcc was not found. Are you sure your environment has nvcc available? If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.") - else: - print ("INFO: Building deprecated fused adam extension.") - nvcc_args_fused_adam = ['-O3', '--use_fast_math'] + version_dependent_macros - hipcc_args_fused_adam = ['-O3'] + version_dependent_macros - ext_modules.append( - CUDAExtension(name='fused_adam_cuda', - sources=['apex/contrib/csrc/optimizers/fused_adam_cuda.cpp', - 'apex/contrib/csrc/optimizers/fused_adam_cuda_kernel.cu'], - include_dirs=[os.path.join(this_dir, 'csrc'), - os.path.join(this_dir, 'apex/contrib/csrc/optimizers')], - extra_compile_args={'cxx': ['-O3'] + version_dependent_macros, - 'nvcc' : nvcc_args_fused_adam if not IS_ROCM_PYTORCH else hipcc_args_fused_adam})) - -if "--deprecated_fused_lamb" in sys.argv or "--cuda_ext" in sys.argv: - if "--deprecated_fused_lamb" in sys.argv: - sys.argv.remove("--deprecated_fused_lamb") - - if torch.utils.cpp_extension.CUDA_HOME is None and not IS_ROCM_PYTORCH: - raise RuntimeError("--deprecated_fused_lamb was requested, but nvcc was not found. Are you sure your environment has nvcc available? 
If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.") - else: - print ("INFO: Building deprecated fused lamb extension.") - nvcc_args_fused_lamb = ['-O3', '--use_fast_math'] + version_dependent_macros - hipcc_args_fused_lamb = ['-O3'] + version_dependent_macros - ext_modules.append( - CUDAExtension(name='fused_lamb_cuda', - sources=['apex/contrib/csrc/optimizers/fused_lamb_cuda.cpp', - 'apex/contrib/csrc/optimizers/fused_lamb_cuda_kernel.cu', - 'csrc/multi_tensor_l2norm_kernel.cu'], - include_dirs=[os.path.join(this_dir, 'csrc')], - extra_compile_args = nvcc_args_fused_lamb if not IS_ROCM_PYTORCH else hipcc_args_fused_lamb)) - -# Check, if ATen/CUDAGeneratorImpl.h is found, otherwise use ATen/cuda/CUDAGeneratorImpl.h -# See https://github.com/pytorch/pytorch/pull/70650 -generator_flag = [] -torch_dir = torch.__path__[0] -if os.path.exists(os.path.join(torch_dir, "include", "ATen", "CUDAGeneratorImpl.h")): - generator_flag = ["-DOLD_GENERATOR_PATH"] - -if "--fast_layer_norm" in sys.argv: - sys.argv.remove("--fast_layer_norm") - raise_if_cuda_home_none("--fast_layer_norm") - # Check, if CUDA11 is installed for compute capability 8.0 - cc_flag = [] - _, bare_metal_major, _ = get_cuda_bare_metal_version(CUDA_HOME) - if int(bare_metal_major) >= 11: - cc_flag.append("-gencode") - cc_flag.append("arch=compute_80,code=sm_80") - - if CUDA_HOME is None and not IS_ROCM_PYTORCH: - raise RuntimeError("--fast_layer_norm was requested, but nvcc was not found. Are you sure your environment has nvcc available? If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.") - else: - # Check, if CUDA11 is installed for compute capability 8.0 - cc_flag = [] - _, bare_metal_major, _ = get_cuda_bare_metal_version(CUDA_HOME) - if int(bare_metal_major) >= 11: - cc_flag.append('-gencode') - cc_flag.append('arch=compute_80,code=sm_80') - -if "--fmha" in sys.argv: - sys.argv.remove("--fmha") - raise_if_cuda_home_none("--fmha") - # Check, if CUDA11 is installed for compute capability 8.0 - cc_flag = [] - _, bare_metal_major, _ = get_cuda_bare_metal_version(CUDA_HOME) - if int(bare_metal_major) < 11: - raise RuntimeError("--fmha only supported on SM80") - cc_flag.append("-gencode") - cc_flag.append("arch=compute_80,code=sm_80") - - if CUDA_HOME is None and not IS_ROCM_PYTORCH: - raise RuntimeError("--fmha was requested, but nvcc was not found. Are you sure your environment has nvcc available? 
If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.") - else: - # Check, if CUDA11 is installed for compute capability 8.0 - cc_flag = [] - _, bare_metal_major, _ = get_cuda_bare_metal_version(CUDA_HOME) - if int(bare_metal_major) < 11: - raise RuntimeError("--fmha only supported on SM80") - - ext_modules.append( - CUDAExtension(name='fmhalib', - sources=[ - 'apex/contrib/csrc/fmha/fmha_api.cpp', - 'apex/contrib/csrc/fmha/src/fmha_noloop_reduce.cu', - 'apex/contrib/csrc/fmha/src/fmha_fprop_fp16_128_64_kernel.sm80.cu', - 'apex/contrib/csrc/fmha/src/fmha_fprop_fp16_256_64_kernel.sm80.cu', - 'apex/contrib/csrc/fmha/src/fmha_fprop_fp16_384_64_kernel.sm80.cu', - 'apex/contrib/csrc/fmha/src/fmha_fprop_fp16_512_64_kernel.sm80.cu', - 'apex/contrib/csrc/fmha/src/fmha_dgrad_fp16_128_64_kernel.sm80.cu', - 'apex/contrib/csrc/fmha/src/fmha_dgrad_fp16_256_64_kernel.sm80.cu', - 'apex/contrib/csrc/fmha/src/fmha_dgrad_fp16_384_64_kernel.sm80.cu', - 'apex/contrib/csrc/fmha/src/fmha_dgrad_fp16_512_64_kernel.sm80.cu', - ], - extra_compile_args={'cxx': ['-O3', - ] + version_dependent_macros + generator_flag, - 'nvcc':['-O3', - '-gencode', 'arch=compute_80,code=sm_80', - '-U__CUDA_NO_HALF_OPERATORS__', - '-U__CUDA_NO_HALF_CONVERSIONS__', - '--expt-relaxed-constexpr', - '--expt-extended-lambda', - '--use_fast_math'] + version_dependent_macros + generator_flag + cc_flag}, - include_dirs=[os.path.join(this_dir, "apex/contrib/csrc"), os.path.join(this_dir, "apex/contrib/csrc/fmha/src")])) - - -if "--fast_multihead_attn" in sys.argv or "--cuda_ext" in sys.argv: - if "--fast_multihead_attn" in sys.argv: - sys.argv.remove("--fast_multihead_attn") - - if torch.utils.cpp_extension.CUDA_HOME is None and not IS_ROCM_PYTORCH: - raise RuntimeError("--fast_multihead_attn was requested, but nvcc was not found. Are you sure your environment has nvcc available? 
If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.") - else: - # Check, if CUDA11 is installed for compute capability 8.0 - cc_flag = [] - if not IS_ROCM_PYTORCH: - _, bare_metal_major, _ = get_cuda_bare_metal_version(torch.utils.cpp_extension.CUDA_HOME) - if int(bare_metal_major) >= 11: - cc_flag.append('-gencode') - cc_flag.append('arch=compute_80,code=sm_80') - cc_flag.append('-gencode') - cc_flag.append('arch=compute_86,code=sm_86') - - subprocess.run(["git", "submodule", "update", "--init", "apex/contrib/csrc/multihead_attn/cutlass"]) - nvcc_args_mha = ['-O3', - '-gencode', - 'arch=compute_70,code=sm_70', - '-Iapex/contrib/csrc/multihead_attn/cutlass', - '-U__CUDA_NO_HALF_OPERATORS__', - '-U__CUDA_NO_HALF_CONVERSIONS__', - '--expt-relaxed-constexpr', - '--expt-extended-lambda', - '--use_fast_math'] + version_dependent_macros + generator_flag + cc_flag - hipcc_args_mha = ['-O3', - '-Iapex/contrib/csrc/multihead_attn/cutlass', - '-I/opt/rocm/include/hiprand', - '-I/opt/rocm/include/rocrand', - '-U__HIP_NO_HALF_OPERATORS__', - '-U__HIP_NO_HALF_CONVERSIONS__'] + version_dependent_macros + generator_flag - if found_Backward_Pass_Guard: - hipcc_args_mha = hipcc_args_mha + ['-DBACKWARD_PASS_GUARD'] + ['-DBACKWARD_PASS_GUARD_CLASS=BackwardPassGuard'] - if found_ROCmBackward_Pass_Guard: - hipcc_args_mha = hipcc_args_mha + ['-DBACKWARD_PASS_GUARD'] + ['-DBACKWARD_PASS_GUARD_CLASS=ROCmBackwardPassGuard'] - - ext_modules.append( - CUDAExtension( - name='fast_multihead_attn', - sources=[ - 'apex/contrib/csrc/multihead_attn/multihead_attn_frontend.cpp', - 'apex/contrib/csrc/multihead_attn/additive_masked_softmax_dropout_cuda.cu', - "apex/contrib/csrc/multihead_attn/masked_softmax_dropout_cuda.cu", - "apex/contrib/csrc/multihead_attn/encdec_multihead_attn_cuda.cu", - "apex/contrib/csrc/multihead_attn/encdec_multihead_attn_norm_add_cuda.cu", - "apex/contrib/csrc/multihead_attn/self_multihead_attn_cuda.cu", - "apex/contrib/csrc/multihead_attn/self_multihead_attn_bias_additive_mask_cuda.cu", - "apex/contrib/csrc/multihead_attn/self_multihead_attn_bias_cuda.cu", - "apex/contrib/csrc/multihead_attn/self_multihead_attn_norm_add_cuda.cu", - ], - include_dirs=[os.path.join(this_dir, 'csrc'), - os.path.join(this_dir, 'apex/contrib/csrc/multihead_attn')], - extra_compile_args={'cxx': ['-O3',] + version_dependent_macros + generator_flag, - 'nvcc':nvcc_args_mha if not IS_ROCM_PYTORCH else hipcc_args_mha} - ) - ) - -if "--transducer" in sys.argv or "--cuda_ext" in sys.argv: - if "--transducer" in sys.argv: - sys.argv.remove("--transducer") - - if not IS_ROCM_PYTORCH: - raise_if_cuda_home_none("--transducer") - - ext_modules.append( - CUDAExtension( - name="transducer_joint_cuda", - sources=[ - "apex/contrib/csrc/transducer/transducer_joint.cpp", - "apex/contrib/csrc/transducer/transducer_joint_kernel.cu", - ], - extra_compile_args={ - "cxx": ["-O3"] + version_dependent_macros + generator_flag, - "nvcc": append_nvcc_threads(["-O3"] + version_dependent_macros + generator_flag) if not IS_ROCM_PYTORCH - else ["-O3"] + version_dependent_macros + generator_flag, - }, - include_dirs=[os.path.join(this_dir, "csrc"), os.path.join(this_dir, "apex/contrib/csrc/multihead_attn")], - ) - ) - ext_modules.append( - CUDAExtension( - name="transducer_loss_cuda", - sources=[ - "apex/contrib/csrc/transducer/transducer_loss.cpp", - "apex/contrib/csrc/transducer/transducer_loss_kernel.cu", - ], - include_dirs=[os.path.join(this_dir, 
"csrc")], - extra_compile_args={ - "cxx": ["-O3"] + version_dependent_macros, - "nvcc": append_nvcc_threads(["-O3"] + version_dependent_macros) if not IS_ROCM_PYTORCH - else ["-O3"] + version_dependent_macros, - }, - ) - ) +def get_op_build_env_name(op_name): + assert hasattr(ALL_OPS[op_name], 'BUILD_VAR'), \ + f"{op_name} is missing BUILD_VAR field" + return ALL_OPS[op_name].BUILD_VAR -# note (mkozuki): Now `--fast_bottleneck` option (i.e. apex/contrib/bottleneck) depends on `--peer_memory` and `--nccl_p2p`. -if "--fast_bottleneck" in sys.argv: - sys.argv.remove("--fast_bottleneck") - raise_if_cuda_home_none("--fast_bottleneck") - if check_cudnn_version_and_warn("--fast_bottleneck", 8400): - subprocess.run(["git", "submodule", "update", "--init", "apex/contrib/csrc/cudnn-frontend/"]) - ext_modules.append( - CUDAExtension( - name="fast_bottleneck", - sources=["apex/contrib/csrc/bottleneck/bottleneck.cpp"], - include_dirs=[os.path.join(this_dir, "apex/contrib/csrc/cudnn-frontend/include")], - extra_compile_args={"cxx": ["-O3"] + version_dependent_macros + generator_flag}, - ) - ) - -if "--peer_memory" in sys.argv or "--cuda_ext" in sys.argv: - if "--peer_memory" in sys.argv: - sys.argv.remove("--peer_memory") - - if not IS_ROCM_PYTORCH: - raise_if_cuda_home_none("--peer_memory") - - ext_modules.append( - CUDAExtension( - name="peer_memory_cuda", - sources=[ - "apex/contrib/csrc/peer_memory/peer_memory_cuda.cu", - "apex/contrib/csrc/peer_memory/peer_memory.cpp", - ], - extra_compile_args={"cxx": ["-O3"] + version_dependent_macros + generator_flag}, - ) - ) -if "--nccl_p2p" in sys.argv or "--cuda_ext" in sys.argv: - if "--nccl_p2p" in sys.argv: - sys.argv.remove("--nccl_p2p") - - if not IS_ROCM_PYTORCH: - raise_if_cuda_home_none("--nccl_p2p") - - ext_modules.append( - CUDAExtension( - name="nccl_p2p_cuda", - sources=[ - "apex/contrib/csrc/nccl_p2p/nccl_p2p_cuda.cu", - "apex/contrib/csrc/nccl_p2p/nccl_p2p.cpp", - ], - extra_compile_args={"cxx": ["-O3"] + version_dependent_macros + generator_flag}, - ) - ) +def op_build_enabled(op_name): + env_var = get_op_build_env_name(op_name) + return int(get_env_if_set(env_var, BUILD_OP_DEFAULT)) +def is_op_build_included(op_name): + #check if operation has BUILD_FLAG defined + assert hasattr(ALL_OPS[op_name], 'INCLUDE_FLAG'), \ + f"{op_name} is missing INCLUDE_FLAG field" + include_flag = ALL_OPS[op_name].INCLUDE_FLAG + return get_env_if_set(include_flag, False) -if "--fused_conv_bias_relu" in sys.argv: - sys.argv.remove("--fused_conv_bias_relu") - raise_if_cuda_home_none("--fused_conv_bias_relu") - if check_cudnn_version_and_warn("--fused_conv_bias_relu", 8400): - subprocess.run(["git", "submodule", "update", "--init", "apex/contrib/csrc/cudnn-frontend/"]) - ext_modules.append( - CUDAExtension( - name="fused_conv_bias_relu", - sources=["apex/contrib/csrc/conv_bias_relu/conv_bias_relu.cpp"], - include_dirs=[os.path.join(this_dir, "apex/contrib/csrc/cudnn-frontend/include")], - extra_compile_args={"cxx": ["-O3"] + version_dependent_macros + generator_flag}, - ) - ) - -#NCCL allocator is supported for apex 1.6 version and onwards -if TORCH_MAJOR == 2 and TORCH_MINOR >= 6: - if "--nccl_allocator" in sys.argv or "--cuda_ext" in sys.argv: - if "--nccl_allocator" in sys.argv: - sys.argv.remove("--nccl_allocator") - raise_if_cuda_home_none("--nccl_allocator") - _nccl_version_getter = load( - name="_nccl_version_getter", - sources=["apex/contrib/csrc/nccl_p2p/nccl_version.cpp", "apex/contrib/csrc/nccl_p2p/nccl_version_check.cu"], - ) - ccl_library = ["nccl"] - if 
IS_ROCM_PYTORCH: - ccl_library = ["rccl"] - _available_nccl_version = _nccl_version_getter.get_nccl_version() - if _available_nccl_version >= (2, 19): - ext_modules.append( - CUDAExtension( - name="_apex_nccl_allocator", - sources=[ - "apex/contrib/csrc/nccl_allocator/NCCLAllocator.cpp", - ], - include_dirs=[os.path.join(this_dir, "apex/apex/contrib/csrc/nccl_allocator")], - libraries=ccl_library, - extra_compile_args={"cxx": ["-O3"] + version_dependent_macros + generator_flag}, - ) - ) +ext_modules = [] +install_ops = dict.fromkeys(ALL_OPS.keys(), False) + +for op_name, builder in ALL_OPS.items(): + op_compatible = builder.is_compatible() + build_enabled = op_build_enabled(op_name) or is_op_build_included(op_name) + + # If op is requested but not available, throw an error. + if build_enabled and not op_compatible: + env_var = get_op_build_env_name(op_name) + builder.warning(f"Skip pre-compile of incompatible {op_name}; One can disable {op_name} with {env_var}=0") + continue + + # If op is compatible but install is not build enabled (JIT mode). + if IS_ROCM_PYTORCH and op_compatible and not build_enabled: + builder.hipify_extension() + + # If op build enabled, add builder to extensions. + # Also check if corresponding flags are checked + if build_enabled and op_compatible: + install_ops[op_name] = True + ext_modules.append(builder.builder()) + +print(f'Install Ops={install_ops}') + +# Write out version/git info. +git_hash_cmd = shlex.split("bash -c \"git rev-parse --short HEAD\"") +git_branch_cmd = shlex.split("bash -c \"git rev-parse --abbrev-ref HEAD\"") +if command_exists('git') and not is_env_set('APEX_BUILD_STRING'): + try: + result = subprocess.check_output(git_hash_cmd) + git_hash = result.decode('utf-8').strip() + result = subprocess.check_output(git_branch_cmd) + git_branch = result.decode('utf-8').strip() + except subprocess.CalledProcessError: + git_hash = "unknown" + git_branch = "unknown" +else: + git_hash = "unknown" + git_branch = "unknown" + +# Parse the apex version string from version.txt. +version_str = get_apex_version() +version_str += f'+{git_hash}' + +torch_version = ".".join([str(TORCH_MAJOR), str(TORCH_MINOR)]) +bf16_support = False +# Set cuda_version to 0.0 if cpu-only. +cuda_version = "0.0" +nccl_version = "0.0" +# Set hip_version to 0.0 if cpu-only. +hip_version = "0.0" +if torch.version.cuda is not None: + cuda_version = ".".join(torch.version.cuda.split('.')[:2]) + if sys.platform != "win32": + if isinstance(torch.cuda.nccl.version(), int): + # This will break if minor version > 9. 
+ nccl_version = ".".join(str(torch.cuda.nccl.version())[:2]) else: - warnings.warn( - f"Skip `--nccl_allocator` as it requires NCCL 2.19 or later, but {_available_nccl_version[0]}.{_available_nccl_version[1]}" - ) - + nccl_version = ".".join(map(str, torch.cuda.nccl.version()[:2])) + if hasattr(torch.cuda, 'is_bf16_supported') and torch.cuda.is_available(): + bf16_support = torch.cuda.is_bf16_supported() +if hasattr(torch.version, 'hip') and torch.version.hip is not None: + hip_version = ".".join(torch.version.hip.split('.')[:2]) +torch_info = { + "version": torch_version, + "bf16_support": bf16_support, + "cuda_version": cuda_version, + "nccl_version": nccl_version, + "hip_version": hip_version +} + +print(f"version={version_str}, git_hash={git_hash}, git_branch={git_branch}") +with open('apex/git_version_info_installed.py', 'w') as fd: + fd.write(f"version='{version_str}'\n") + fd.write(f"git_hash='{git_hash}'\n") + fd.write(f"git_branch='{git_branch}'\n") + fd.write(f"installed_ops={install_ops}\n") + fd.write(f"build_flags={build_flags}\n") + fd.write(f"torch_info={torch_info}\n") +if "--cpp_ext" in sys.argv: + sys.argv.remove("--cpp_ext") if "--cuda_ext" in sys.argv: sys.argv.remove("--cuda_ext") @@ -982,16 +269,46 @@ def check_if_rocm_pytorch(): with open('requirements.txt') as f: required = f.read().splitlines() +# Find python files in compatibility folder +compatibility_dir = os.path.join(this_dir, 'compatibility') +py_modules = [] + +if os.path.exists(compatibility_dir): + for file in os.listdir(compatibility_dir): + if file.endswith('.py') and file != '__init__.py': + module_name = f"{file[:-3]}" + py_modules.append(module_name) + + #copy outside temporarily + src_file = os.path.join(compatibility_dir, file) + dst_file = os.path.join(this_dir, file) + shutil.copy2(src_file, dst_file) +else: + print("Warning: compatibility folder not found") + +class BinaryDistribution(Distribution): + """Force wheel to be platform-specific even without ext_modules.""" + def has_ext_modules(self): + return True + setup( name="apex", version=get_apex_version(), packages=find_packages( - exclude=("build", "csrc", "include", "tests", "dist", "docs", "tests", "examples", "apex.egg-info",) + exclude=("build", "include", "tests", "dist", "docs", "tests", "examples", "apex.egg-info", "op_builder", "compatibility") ), description="PyTorch Extensions written by NVIDIA", ext_modules=ext_modules, cmdclass={'build_ext': BuildExtension} if ext_modules else {}, extras_require=extras, - install_requires=required + install_requires=required, + include_package_data=True, + py_modules=py_modules, + distclass=BinaryDistribution ) +#delete the temporarily copied compatibility files +for py_module in py_modules: + path = dst_file = os.path.join(this_dir, py_module + ".py") + if os.path.exists(path): + os.remove(path) \ No newline at end of file diff --git a/tests/jit_build/build.sh b/tests/jit_build/build.sh new file mode 100644 index 000000000..1cb09af96 --- /dev/null +++ b/tests/jit_build/build.sh @@ -0,0 +1,62 @@ +#parse the arguments +JIT_CONDITION="$2" +echo "JIT_CONDITION $JIT_CONDITION" + +echo $(pwd) + +git checkout Refactor_build +git submodule update --init --recursive + +# uninstall apex +pip uninstall apex -y +make clean + +#install apex for different conditions +if [ "$JIT_CONDITION" = "1" ]; then + pip install . --no-build-isolation +elif [ "$JIT_CONDITION" = "2" ]; then + APEX_BUILD_CPP_OPS=1 pip install . --no-build-isolation +elif [ "$JIT_CONDITION" = "3" ]; then + APEX_BUILD_CUDA_OPS=1 pip install . 
--no-build-isolation +elif [ "$JIT_CONDITION" = "4" ]; then + APEX_BUILD_CPP_OPS=1 APEX_BUILD_CUDA_OPS=1 pip install . --no-build-isolation +elif [ "$JIT_CONDITION" = "5" ]; then + APEX_BUILD_FUSED_DENSE=1 pip install . --no-build-isolation +elif [ "$JIT_CONDITION" = "6" ]; then + python setup.py install --cpp_ext --cuda_ext +elif [ "$JIT_CONDITION" = "7" ]; then + APEX_BUILD_AMP_C=1 APEX_BUILD_APEX_C=1 APEX_BUILD_BNP=1 \ + APEX_BUILD_DISTRIBUTED_ADAM=1 APEX_BUILD_DISTRIBUTED_LAMB=1 APEX_BUILD_FAST_MULTIHEAD_ATTN=1 \ + APEX_BUILD_FOCAL_LOSS=1 APEX_BUILD_FUSED_ADAM=1 APEX_BUILD_FUSED_BIAS_SWIGLU=1 \ + APEX_BUILD_FUSED_DENSE=1 APEX_BUILD_FUSED_INDEX_MUL_2D=1 APEX_BUILD_FUSED_LAMB=1 \ + APEX_BUILD_FUSED_LAYER_NORM=1 APEX_BUILD_FUSED_ROPE=1 APEX_BUILD_FUSED_WEIGHT_GRADIENT_MLP=1 \ + APEX_BUILD_GENERIC_SCALED_MASKED_SOFTMAX_CUDA=1 APEX_BUILD_MLP=1 APEX_BUILD_NCCL_ALLOCATOR=1 \ + APEX_BUILD_NCCL_P2P=1 APEX_BUILD_PEER_MEMORY=1 APEX_BUILD_SCALED_MASKED_SOFTMAX_CUDA=1 \ + APEX_BUILD_SCALED_SOFTMAX_CUDA=1 APEX_BUILD_SCALED_UPPER_TRIANG_MASKED_SOFTMAX_CUDA=1 APEX_BUILD_SYNCBN=1 \ + APEX_BUILD_TRANSDUCER_JOINT=1 APEX_BUILD_TRANSDUCER_LOSS=1 APEX_BUILD_XENTROPY=1 pip install . --no-build-isolation +elif [ "$JIT_CONDITION" = "8" ]; then + python -m build --wheel --no-isolation . + pip install dist/apex-*.whl +elif [ "$JIT_CONDITION" = "9" ]; then + APEX_BUILD_CPP_OPS=1 python -m build --wheel --no-isolation . +elif [ "$JIT_CONDITION" = "10" ]; then + APEX_BUILD_CUDA_OPS=1 python -m build --wheel --no-isolation . + pip install dist/apex-*.whl +elif [ "$JIT_CONDITION" = "11" ]; then + APEX_BUILD_CPP_OPS=1 APEX_BUILD_CUDA_OPS=1 python -m build --wheel --no-isolation . + pip install dist/apex-*.whl +elif [ "$JIT_CONDITION" = "12" ]; then + APEX_BUILD_FUSED_DENSE=1 python -m build --wheel --no-isolation . + pip install dist/apex-*.whl +elif [ "$JIT_CONDITION" = "13" ]; then + APEX_BUILD_AMP_C=1 APEX_BUILD_APEX_C=1 APEX_BUILD_BNP=1 \ + APEX_BUILD_DISTRIBUTED_ADAM=1 APEX_BUILD_DISTRIBUTED_LAMB=1 APEX_BUILD_FAST_MULTIHEAD_ATTN=1 \ + APEX_BUILD_FOCAL_LOSS=1 APEX_BUILD_FUSED_ADAM=1 APEX_BUILD_FUSED_BIAS_SWIGLU=1 \ + APEX_BUILD_FUSED_DENSE=1 APEX_BUILD_FUSED_INDEX_MUL_2D=1 APEX_BUILD_FUSED_LAMB=1 \ + APEX_BUILD_FUSED_LAYER_NORM=1 APEX_BUILD_FUSED_ROPE=1 APEX_BUILD_FUSED_WEIGHT_GRADIENT_MLP=1 \ + APEX_BUILD_GENERIC_SCALED_MASKED_SOFTMAX_CUDA=1 APEX_BUILD_MLP=1 APEX_BUILD_NCCL_ALLOCATOR=1 \ + APEX_BUILD_NCCL_P2P=1 APEX_BUILD_PEER_MEMORY=1 APEX_BUILD_SCALED_MASKED_SOFTMAX_CUDA=1 \ + APEX_BUILD_SCALED_SOFTMAX_CUDA=1 APEX_BUILD_SCALED_UPPER_TRIANG_MASKED_SOFTMAX_CUDA=1 APEX_BUILD_SYNCBN=1 \ + APEX_BUILD_TRANSDUCER_JOINT=1 APEX_BUILD_TRANSDUCER_LOSS=1 APEX_BUILD_XENTROPY=1 python -m build --wheel --no-isolation . 
+ pip install dist/apex-*.whl +fi \ No newline at end of file diff --git a/tests/jit_build/build_test.sh b/tests/jit_build/build_test.sh new file mode 100644 index 000000000..5e61b696c --- /dev/null +++ b/tests/jit_build/build_test.sh @@ -0,0 +1,5 @@ +#parse the arguments +JIT_CONDITION="$2" + +sh tests/jit_build/build.sh "condition" $JIT_CONDITION +sh tests/jit_build/run_tests.sh "condition" $JIT_CONDITION \ No newline at end of file diff --git a/tests/jit_build/count_built_so.py b/tests/jit_build/count_built_so.py new file mode 100644 index 000000000..353034acb --- /dev/null +++ b/tests/jit_build/count_built_so.py @@ -0,0 +1,11 @@ +import glob +import os +import site + + +SITE_PACKAGES_FOLDERS = site.getsitepackages()[0] + +#count the number of *.so files in the folder +so_files = glob.glob(os.path.join(SITE_PACKAGES_FOLDERS, "apex/*.so"), recursive=True) +count = len(so_files) +print(count) diff --git a/tests/jit_build/count_failed_unit_tests.py b/tests/jit_build/count_failed_unit_tests.py new file mode 100644 index 000000000..c6d95d3ea --- /dev/null +++ b/tests/jit_build/count_failed_unit_tests.py @@ -0,0 +1,16 @@ +import sys + +test_file = sys.argv[1] + +#read lines from test file +with open(test_file, "r") as f: + lines = f.readlines() + +failed_tests = [] +for line in lines: + if "ERROR: " in line: + failed_tests.append(line[7:].strip()) + if " FAILED" in line and "#" not in line: + failed_tests.append(line[: -8].strip()) +print(len(failed_tests)) +#print(str(len(failed_tests)) + "," + ";".join(failed_tests)) \ No newline at end of file diff --git a/tests/jit_build/count_torch_extensions.py b/tests/jit_build/count_torch_extensions.py new file mode 100644 index 000000000..3c8a9fda3 --- /dev/null +++ b/tests/jit_build/count_torch_extensions.py @@ -0,0 +1,9 @@ +import os + +import torch.utils.cpp_extension + +torch_ext_directory = torch.utils.cpp_extension._get_build_directory("", False) +#count the number of folders +folders = [f for f in os.listdir(torch_ext_directory) if os.path.isdir(os.path.join(torch_ext_directory, f))] +count = len(folders) +print(count) \ No newline at end of file diff --git a/tests/jit_build/docker/base.ubuntu.amd.Dockerfile b/tests/jit_build/docker/base.ubuntu.amd.Dockerfile new file mode 100644 index 000000000..b825ba05e --- /dev/null +++ b/tests/jit_build/docker/base.ubuntu.amd.Dockerfile @@ -0,0 +1,3 @@ +# CONTEXT {'gpu_vendor': 'AMD', 'guest_os': 'UBUNTU'} +ARG BASE_DOCKER=rocm/pytorch +FROM $BASE_DOCKER \ No newline at end of file diff --git a/tests/jit_build/load_extra_extensions.py b/tests/jit_build/load_extra_extensions.py new file mode 100644 index 000000000..16d25d2f8 --- /dev/null +++ b/tests/jit_build/load_extra_extensions.py @@ -0,0 +1,16 @@ +from apex.op_builder.fused_lamb import FusedLambBuilder +from apex.op_builder.generic_scaled_masked_softmax_cuda import GenericScaledMaskedSoftmaxCudaBuilder +from apex.op_builder.scaled_softmax_cuda import ScaledSoftmaxCudaBuilder +from apex.op_builder.nccl_p2p import NCCLP2PBuilder + +''' +generic_scaled_masked_softmax_cuda +scaled_softmax_cuda +fused_lamb_cuda +nccl_p2p_cuda +''' + +FusedLambBuilder().load() +GenericScaledMaskedSoftmaxCudaBuilder().load() +ScaledSoftmaxCudaBuilder().load() +NCCLP2PBuilder().load() \ No newline at end of file diff --git a/tests/jit_build/models.json b/tests/jit_build/models.json new file mode 100644 index 000000000..72963295b --- /dev/null +++ b/tests/jit_build/models.json @@ -0,0 +1,158 @@ +[ + { + "name": "apex_jit_install_condition1", + "dockerfile": "docker/base", + 
"scripts": "scripts", + "n_gpus": "8", + "owner": "skishore@amd.com", + "training_precision": "", + "tags": [ + "apex_jit" + ], + "args": "--condition 1" + }, + { + "name": "apex_jit_install_condition2", + "dockerfile": "docker/base", + "scripts": "scripts", + "n_gpus": "8", + "owner": "skishore@amd.com", + "training_precision": "", + "tags": [ + "apex_jit" + ], + "args": "--condition 2" + }, + { + "name": "apex_jit_install_condition3", + "dockerfile": "docker/base", + "scripts": "scripts", + "n_gpus": "8", + "owner": "skishore@amd.com", + "training_precision": "", + "tags": [ + "apex_jit" + ], + "args": "--condition 3" + }, + { + "name": "apex_jit_install_condition4", + "dockerfile": "docker/base", + "scripts": "scripts", + "n_gpus": "8", + "owner": "skishore@amd.com", + "training_precision": "", + "tags": [ + "apex_jit" + ], + "args": "--condition 4" + }, + { + "name": "apex_jit_install_condition5", + "dockerfile": "docker/base", + "scripts": "scripts", + "n_gpus": "8", + "owner": "skishore@amd.com", + "training_precision": "", + "tags": [ + "apex_jit" + ], + "args": "--condition 5" + }, + { + "name": "apex_jit_install_condition6", + "dockerfile": "docker/base", + "scripts": "scripts", + "n_gpus": "8", + "owner": "skishore@amd.com", + "training_precision": "", + "tags": [ + "apex_jit" + ], + "args": "--condition 6" + }, + { + "name": "apex_jit_install_condition7", + "dockerfile": "docker/base", + "scripts": "scripts", + "n_gpus": "8", + "owner": "skishore@amd.com", + "training_precision": "", + "tags": [ + "apex_jit" + ], + "args": "--condition 7" + }, + { + "name": "apex_jit_install_condition8", + "dockerfile": "docker/base", + "scripts": "scripts", + "n_gpus": "8", + "owner": "skishore@amd.com", + "training_precision": "", + "tags": [ + "apex_jit" + ], + "args": "--condition 8" + }, + { + "name": "apex_jit_install_condition9", + "dockerfile": "docker/base", + "scripts": "scripts", + "n_gpus": "8", + "owner": "skishore@amd.com", + "training_precision": "", + "tags": [ + "apex_jit" + ], + "args": "--condition 9" + }, + { + "name": "apex_jit_install_condition10", + "dockerfile": "docker/base", + "scripts": "scripts", + "n_gpus": "8", + "owner": "skishore@amd.com", + "training_precision": "", + "tags": [ + "apex_jit" + ], + "args": "--condition 10" + }, + { + "name": "apex_jit_install_condition11", + "dockerfile": "docker/base", + "scripts": "scripts", + "n_gpus": "8", + "owner": "skishore@amd.com", + "training_precision": "", + "tags": [ + "apex_jit" + ], + "args": "--condition 11" + }, + { + "name": "apex_jit_install_condition12", + "dockerfile": "docker/base", + "scripts": "scripts", + "n_gpus": "8", + "owner": "skishore@amd.com", + "training_precision": "", + "tags": [ + "apex_jit" + ], + "args": "--condition 12" + }, + { + "name": "apex_jit_install_condition13", + "dockerfile": "docker/base", + "scripts": "scripts", + "n_gpus": "8", + "owner": "skishore@amd.com", + "training_precision": "", + "tags": [ + "apex_jit" + ], + "args": "--condition 13" + } +] \ No newline at end of file diff --git a/tests/jit_build/run_tests.sh b/tests/jit_build/run_tests.sh new file mode 100644 index 000000000..eaed64629 --- /dev/null +++ b/tests/jit_build/run_tests.sh @@ -0,0 +1,36 @@ +#parse the arguments +JIT_CONDITION="$2" +echo "JIT_CONDITION $JIT_CONDITION" + +#run the apex unit tests +LOG_FILE=results_jit_unit_test${JIT_CONDITION}.log +LOG_FILE2=results_jit_unit_test${JIT_CONDITION}c.log + +cd tests/L0 +PYTHONUNBUFFERED=1 sh run_rocm.sh 2>&1 | tee ../../$LOG_FILE +cd ../../ + +cd apex/contrib/test 
+PYTHONUNBUFFERED=1 python run_rocm_extensions.py 2>&1 | tee ../../../$LOG_FILE2 +cd ../../../ + +torchrun --nproc_per_node 8 apex/contrib/peer_memory/peer_halo_exchange_module_tests.py 2>&1 | tee -a $LOG_FILE2 + +cd tests/distributed/synced_batchnorm +sh unit_test.sh 2>&1 | tee -a ../../../$LOG_FILE2 +cd ../../../ + +#explicitly load the builder and build the remaining extensions +python tests/jit_build/load_extra_extensions.py 2>&1 | tee $LOG_FILE + +FAILED_TESTS=$(python tests/jit_build/count_failed_unit_tests.py $LOG_FILE) +FAILED_TESTS2=$(python tests/jit_build/count_failed_unit_tests.py $LOG_FILE2) +BUILT_SO_COUNT=$(python tests/jit_build/count_built_so.py) +TORCH_EXTENSIONS_COUNT=$(python tests/jit_build/count_torch_extensions.py) + +echo "Failed L0 tests = $FAILED_TESTS" +echo "Failed contrib tests = $FAILED_TESTS2" +echo ".so count = $BUILT_SO_COUNT" +echo "JIT torch extensions count = $TORCH_EXTENSIONS_COUNT" + +echo "$FAILED_TESTS $FAILED_TESTS2 $BUILT_SO_COUNT $TORCH_EXTENSIONS_COUNT" \ No newline at end of file diff --git a/tests/jit_build/scripts/run.sh b/tests/jit_build/scripts/run.sh new file mode 100644 index 000000000..aeb41fadd --- /dev/null +++ b/tests/jit_build/scripts/run.sh @@ -0,0 +1,25 @@ +#parse the arguments +JIT_CONDITION="$2" + +echo $(pwd) + +WORKSPACE_DIR=/myworkspace +mkdir -p $WORKSPACE_DIR + +cd $WORKSPACE_DIR +git clone https://github.com/rocm/apex.git --recursive +cd apex +git checkout Refactor_build +git submodule update --init --recursive + +sh tests/jit_build/build.sh "condition" $JIT_CONDITION + +# Capture the output from run_tests.sh +TEST_RESULTS=$(sh tests/jit_build/run_tests.sh "condition" $JIT_CONDITION | tail -1) + +# Parse the returned values +read FAILED_TESTS FAILED_TESTS2 BUILT_SO_COUNT TORCH_EXTENSIONS_COUNT <<< "$TEST_RESULTS" + +MULTIPLE_RESULTS_FILE="../results_jit_unit_test.csv" +#echo "condition,failed unit tests" > "$MULTIPLE_RESULTS_FILE" +echo "$JIT_CONDITION,$FAILED_TESTS,$FAILED_TESTS2,$BUILT_SO_COUNT,$TORCH_EXTENSIONS_COUNT" >> "$MULTIPLE_RESULTS_FILE" \ No newline at end of file diff --git a/tests/test_extension_import.py b/tests/test_extension_import.py index 153254ddd..72d88688e 100644 --- a/tests/test_extension_import.py +++ b/tests/test_extension_import.py @@ -2,15 +2,17 @@ import os import subprocess import sys - +import site +import ast +from apex.op_builder.all_ops import ALL_OPS class TestExtensionImport(unittest.TestCase): - def get_extensions_list(self): - """ - This method reads setup.py and gets the list of extensions from the setup.py file - """ + def __init__(self, *args, **kwargs): + super(TestExtensionImport, self).__init__(*args, **kwargs) + + self.jit_info_file = "apex/git_version_info_installed.py" #find the absolute path of this file current_file_path = os.path.abspath(__file__) @@ -21,9 +23,24 @@ def get_extensions_list(self): #apex folder parent_folder_path = os.path.dirname(parent_folder_path) self.parent_folder_path = parent_folder_path + + def is_jit_modules_mode(self): + """ + This method checks if the file git_version_info_installed.py exists + """ + jit_file_path = os.path.join(site.getsitepackages()[0], self.jit_info_file) + #print ("jit_file_path", jit_file_path) + mode = os.path.exists(jit_file_path) + print ("jit_mode", mode) + return mode + + def get_extensions_list_from_setup(self): + """ + This method reads setup.py and gets the list of extensions from the setup.py file + """ #get setup.py file contents - setup_path = os.path.join(parent_folder_path, "setup.py") + setup_path = 
os.path.join(self.parent_folder_path, "setup.py") #read setup_path contents with open(setup_path, 'r') as f: @@ -62,6 +79,21 @@ def get_extensions_list(self): return extensions + def get_jit_modules(self): + """ + This method reads the jit file and extracts installed_ops dictionary + """ + jit_info_path = os.path.join(site.getsitepackages()[0], self.jit_info_file) + with open(jit_info_path, 'r') as f: + lines = f.readlines() + for line in lines: + if "installed_ops" in line: + ops_list = line[len("installed_ops") + 1 : ] + ops_list = ast.literal_eval(ops_list) + #print ("op_list", ops_list) + return list(ops_list.keys()) + return {} + def get_environment(self): """ This method retrieves the environment for testing import @@ -122,10 +154,46 @@ def check_extension_import(self, extension_name, env): print(f"Error testing import for {extension_name}: {e}") return False, str(e) + def check_jit_extension_import(self, extension_name, env): + all_ops = dict.fromkeys(ALL_OPS.keys(), False) + #get the builder for that extension + builder = ALL_OPS[extension_name] + builder_name = type(builder).__name__ + #print ("----builder_name-----", builder_name) + + #increase timeout + timeout = 60 * 60 + try: + # Run Python subprocess to test the import + result = subprocess.run([ + sys.executable, '-c', + 'from apex.op_builder import ' + builder_name + + '\n' + builder_name + "().load()" + ], capture_output=True, text=True, timeout=timeout, env=env) + print ("result.stdout", result.stdout, result.stderr) + # Check if subprocess completed successfully + if result.returncode != 0 and "Error" in result.stderr: + return False, result.stderr + else: + return True, "" + + except subprocess.TimeoutExpired: + print(f"Import test timed out for {extension_name}") + return False, "Timeout" + except Exception as e: + print(f"Error testing import for {extension_name}: {e}") + return False, str(e) + def test_extensions_import(self): - #get the list of extensions - extensions = self.get_extensions_list() + #check the extensions mode + jit_mode = self.is_jit_modules_mode() + + if not jit_mode: + #get the list of extensions from setup.py + extensions = self.get_extensions_list_from_setup() + else: + extensions = self.get_jit_modules() #get environment env = self.get_environment() @@ -135,7 +203,10 @@ def test_extensions_import(self): for extension in extensions: print ("checking extension", extension) with self.subTest(extension=extension): - success, error_message = self.check_extension_import(extension, env) + if not jit_mode: + success, error_message = self.check_extension_import(extension, env) + else: + success, error_message = self.check_jit_extension_import(extension, env) #self.assertTrue(success, f"Failed to import extension: {extension}") results.append((extension, success, error_message))
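
For context, the sketch below shows one way the pieces introduced in this patch fit together at runtime: setup.py records which extensions were pre-built in `apex/git_version_info_installed.py`, and any op left in JIT mode can be compiled on first use through its builder in `apex.op_builder` (the same entry points that `load_extra_extensions.py` and the updated test rely on). This is a minimal illustration, not part of the patch; the exact site-packages path and the `fused_lamb_cuda` key name are assumptions.

```python
# Minimal sketch (not part of the patch): check which apex ops were pre-built
# at install time, then JIT-load one that was left for runtime compilation.
import ast
import os
import site

# setup.py writes installed_ops, build_flags and torch_info into this file.
info_path = os.path.join(site.getsitepackages()[0], "apex", "git_version_info_installed.py")

installed_ops = {}
with open(info_path) as fd:
    for line in fd:
        if line.startswith("installed_ops="):
            installed_ops = ast.literal_eval(line.split("=", 1)[1])

print("pre-built ops:", [name for name, built in installed_ops.items() if built])

# Ops that were not pre-built can be compiled on first use via their builder,
# mirroring load_extra_extensions.py; the op key name here is an assumption.
if not installed_ops.get("fused_lamb_cuda", False):
    from apex.op_builder.fused_lamb import FusedLambBuilder
    fused_lamb_cuda = FusedLambBuilder().load()  # triggers the torch cpp_extension JIT build
```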