diff --git a/.gitignore b/.gitignore
index 5fe868b36..da67982aa 100644
--- a/.gitignore
+++ b/.gitignore
@@ -148,3 +148,7 @@ cython_debug/
 *.hip
 *_hip.*
 *hip*
+
+
+# file temporarily created for the build process
+apex/git_version_info_installed.py
\ No newline at end of file
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 000000000..a5dc0456c
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,2 @@
+recursive-include apex/contrib/csrc *
+recursive-include apex/csrc *
\ No newline at end of file
diff --git a/Makefile b/Makefile
new file mode 100644
index 000000000..99e44805f
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,17 @@
+PYTHON = python3
+PIP = $(PYTHON) -m pip
+
+clean: # This will remove ALL build folders.
+	@test -d build/ && echo "Deleting build folder" || true
+	@test -d build/ && rm -r build/ || true
+	@test -d dist/ && echo "Deleting dist folder" || true
+	@test -d dist/ && rm -r dist/ || true
+	@test -d apex.egg-info/ && echo "Deleting apex.egg-info folder" || true
+	@test -d apex.egg-info/ && rm -r apex.egg-info/ || true
+
+	$(PYTHON) scripts/clean.py # remove the apex extensions installed in the torch extensions folder
+
+aiter:
+	$(PIP) uninstall -y aiter
+	cd third_party/aiter && $(PIP) install . --no-build-isolation --no-deps
+
diff --git a/README.md b/README.md
index 66bead147..81b647993 100644
--- a/README.md
+++ b/README.md
@@ -100,24 +100,21 @@ Note that we recommend restoring the model using the same `opt_level`. Also note
 # Installation
 
 ## Containers
-ROCm pytorch containers are available from https://hub.docker.com/r/rocm/pytorch.
+ROCm PyTorch containers ship with the apex package preinstalled and are available from https://hub.docker.com/r/rocm/pytorch.
 
 ## From Source
 
-To install Apex from source, we recommend using the nightly Pytorch obtainable from https://github.com/rocm/pytorch.
+PyTorch must be installed before installing apex. We recommend the nightly PyTorch obtainable from https://github.com/rocm/pytorch. The latest stable release obtainable from https://pytorch.org should also work.
 
-The latest stable release obtainable from https://pytorch.org should also work.
-
-## ROCm
 Apex on ROCm supports both python only build and extension build.
 Note: Pytorch version recommended is >=1.5 for extension build.
 
-### To install using python only build use the following command in apex folder:
+### To install all the extensions so that they are built and linked at runtime using [PyTorch's JIT (just-in-time) loader](https://pytorch.org/docs/stable/cpp_extension.html), use the following command in the apex folder:
+This requires ninja to be installed.
 ```
-python setup.py install
+pip install . --no-build-isolation
 ```
-=======
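As a quick sanity check of the JIT-load path described above (an illustrative sketch, not part of this patch): after a default `pip install . --no-build-isolation`, the first use of a fused op should trigger a ninja build through PyTorch's cpp_extension loader. The apex API used below is the standard `FusedLayerNorm`; the tensor shapes are arbitrary.

```python
# Hypothetical smoke test: the first forward pass should JIT-build fused_layer_norm_cuda.
import torch
from apex.normalization import FusedLayerNorm

layer = FusedLayerNorm(normalized_shape=64).cuda()
out = layer(torch.randn(8, 64, device="cuda"))  # triggers the runtime build on first use
print(out.shape)
```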
 ### Supported Versions
 | ``APEX Version`` | ``APEX branch`` | ``Torch Version`` |
 |------------------|-----------------|-------------------|
@@ -140,26 +137,73 @@ ubuntu|pytorch|apex|release/1.0.0|06c33eee43f7a22f3ed7d9c3e5be0ddd757dc345|https
 centos|pytorch|apex|release/1.0.0|06c33eee43f7a22f3ed7d9c3e5be0ddd757dc345|https://github.com/ROCmSoftwarePlatform/apex
 ```
 
-### To install using extensions enabled use the following command in apex folder:
+### To pre-build and install all the supported extensions while installing apex, use the following command in the apex folder:
+```
+APEX_BUILD_CPP_OPS=1 APEX_BUILD_CUDA_OPS=1 pip install . --no-build-isolation
 ```
-# if pip >= 23.1 (ref: https://pip.pypa.io/en/stable/news/#v23-1) which supports multiple `--config-settings` with the same key...
-pip install -v --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
-# otherwise
-python setup.py install --cpp_ext --cuda_ext
+It is also possible to pre-build and install specific extensions by using the following command in the apex folder:
+```
+APEX_BUILD_<EXTENSION>=1 pip install . --no-build-isolation
 ```
-Note that using --cuda_ext flag to install Apex will also enable all the extensions supported on ROCm including "--distributed_adam", "--distributed_lamb", "--bnp", "--xentropy", "--deprecated_fused_adam", "--deprecated_fused_lamb", and "--fast_multihead_attn".
+The following extensions are supported:
+| extension | environment variable to build this extension only | group environment variable that includes it |
+|-----------|-----------|-----------|
+| amp_C | APEX_BUILD_AMP_C=1 | APEX_BUILD_CUDA_OPS=1 |
+| apex_C | APEX_BUILD_APEX_C=1 | APEX_BUILD_CPP_OPS=1 |
+| bnp | APEX_BUILD_BNP=1 | APEX_BUILD_CUDA_OPS=1 |
+| distributed_adam_cuda | APEX_BUILD_DISTRIBUTED_ADAM=1 | APEX_BUILD_CUDA_OPS=1 |
+| distributed_lamb_cuda | APEX_BUILD_DISTRIBUTED_LAMB=1 | APEX_BUILD_CUDA_OPS=1 |
+| fast_multihead_attn | APEX_BUILD_FAST_MULTIHEAD_ATTN=1 | APEX_BUILD_CUDA_OPS=1 |
+| focal_loss_cuda | APEX_BUILD_FOCAL_LOSS=1 | APEX_BUILD_CUDA_OPS=1 |
+| fused_adam_cuda | APEX_BUILD_FUSED_ADAM=1 | APEX_BUILD_CUDA_OPS=1 |
+| fused_bias_swiglu | APEX_BUILD_FUSED_BIAS_SWIGLU=1 | APEX_BUILD_CUDA_OPS=1 |
+| fused_dense_cuda | APEX_BUILD_FUSED_DENSE=1 | APEX_BUILD_CUDA_OPS=1 |
+| fused_index_mul_2d | APEX_BUILD_FUSED_INDEX_MUL_2D=1 | APEX_BUILD_CUDA_OPS=1 |
+| fused_lamb_cuda | APEX_BUILD_FUSED_LAMB=1 | APEX_BUILD_CUDA_OPS=1 |
+| fused_layer_norm_cuda | APEX_BUILD_FUSED_LAYER_NORM=1 | APEX_BUILD_CUDA_OPS=1 |
+| fused_rotary_positional_embedding | APEX_BUILD_FUSED_ROPE=1 | APEX_BUILD_CUDA_OPS=1 |
+| fused_weight_gradient_mlp_cuda | APEX_BUILD_FUSED_WEIGHT_GRADIENT_MLP=1 | APEX_BUILD_CUDA_OPS=1 |
+| generic_scaled_masked_softmax_cuda | APEX_BUILD_GENERIC_SCALED_MASKED_SOFTMAX_CUDA=1 | APEX_BUILD_CUDA_OPS=1 |
+| mlp_cuda | APEX_BUILD_MLP=1 | APEX_BUILD_CUDA_OPS=1 |
+| _apex_nccl_allocator | APEX_BUILD_NCCL_ALLOCATOR=1 | APEX_BUILD_CUDA_OPS=1 |
+| nccl_p2p_cuda | APEX_BUILD_NCCL_P2P=1 | APEX_BUILD_CUDA_OPS=1 |
+| peer_memory_cuda | APEX_BUILD_PEER_MEMORY=1 | APEX_BUILD_CUDA_OPS=1 |
+| scaled_masked_softmax_cuda | APEX_BUILD_SCALED_MASKED_SOFTMAX_CUDA=1 | APEX_BUILD_CUDA_OPS=1 |
+| scaled_softmax_cuda | APEX_BUILD_SCALED_SOFTMAX_CUDA=1 | APEX_BUILD_CUDA_OPS=1 |
+| scaled_upper_triang_masked_softmax_cuda | APEX_BUILD_SCALED_UPPER_TRIANG_MASKED_SOFTMAX_CUDA=1 | APEX_BUILD_CUDA_OPS=1 |
+| syncbn | APEX_BUILD_SYNCBN=1 | APEX_BUILD_CUDA_OPS=1 |
+| transducer_joint_cuda | APEX_BUILD_TRANSDUCER_JOINT=1 | APEX_BUILD_CUDA_OPS=1 |
+| transducer_loss_cuda | APEX_BUILD_TRANSDUCER_LOSS=1 | APEX_BUILD_CUDA_OPS=1 |
+| xentropy_cuda | APEX_BUILD_XENTROPY=1 | APEX_BUILD_CUDA_OPS=1 |
+
+For example, to build FUSED_DENSE you can use the following command:
+```
+APEX_BUILD_FUSED_DENSE=1 pip install . --no-build-isolation
+```
+This will pre-build and install the FUSED_DENSE module, while the rest of the modules are installed to be JIT built and loaded at runtime.
+
 
-In addition, aiter backend can be built during apex installation by providing --aiter flag
+
+The aiter backend can be built and used for fused RoPE. To install aiter:
 ```
-# if pip >= 23.1 (ref: https://pip.pypa.io/en/stable/news/#v23-1) which supports multiple `--config-settings` with the same key...
-pip install -v --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" --config-settings "--build-option=--aiter" ./
-# otherwise
-python setup.py install --cpp_ext --cuda_ext --aiter
+make aiter
 ```
 
 To use aiter in fused rope, you can use the flag ```USE_ROCM_AITER_ROPE_BACKEND=1```.
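The extension names and environment variables in the table above mirror what the new `op_builder` registry exposes at runtime. As an illustrative sketch (not part of this patch), the registry can be queried to see which extensions are buildable on the current machine, assuming this apex branch and torch are installed:

```python
# Sketch: enumerate the extension builders registered in op_builder/all_ops.py
# and report whether each one can be built here.
# ALL_OPS maps extension name -> builder instance (see op_builder/all_ops.py).
from apex.op_builder.all_ops import ALL_OPS

for name, builder in ALL_OPS.items():
    print(f"{name}: compatible={builder.is_compatible()}")
```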
+
+### To create a wheel and then install apex from the wheel, use the following commands in the apex folder:
+```
+python -m build --wheel --no-isolation   # the same APEX_BUILD_* environment variables can be used to pre-build specific, cpp, or cuda extensions
+pip install dist/apex-*.whl
+```
+
+### To uninstall apex and its extensions, use the following commands in the apex folder:
+```
+pip uninstall apex
+make clean
+```
+
 ### Enable hipblasLT on ROCm
 hipblasLT is supported only on mi300 (gfx942) only. python setup.py automatically builds apex with hipblasLT support only if GPU device id is gfx942
@@ -173,33 +217,22 @@ CUDA and C++ extensions via
 ```bash
 git clone https://github.com/rocm/apex
 cd apex
-# if pip >= 23.1 (ref: https://pip.pypa.io/en/stable/news/#v23-1) which supports multiple `--config-settings` with the same key...
-pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
-# otherwise
-pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --global-option="--cpp_ext" --global-option="--cuda_ext" ./
-```
-
-Apex also supports a Python-only build via
-```bash
-pip install -v --disable-pip-version-check --no-build-isolation --no-cache-dir ./
+pip install . --no-build-isolation
 ```
-
-A Python-only build omits:
-- Fused kernels required to use `apex.optimizers.FusedAdam`.
-- Fused kernels required to use `apex.normalization.FusedLayerNorm` and `apex.normalization.FusedRMSNorm`.
-- Fused kernels that improve the performance and numerical stability of `apex.parallel.SyncBatchNorm`.
-- Fused kernels that improve the performance of `apex.parallel.DistributedDataParallel` and `apex.amp`.
-`DistributedDataParallel`, `amp`, and `SyncBatchNorm` will still be usable, but they may be slower.
 
 
 ### [Experimental] Windows
-`pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .` may work if you were able to build Pytorch from source
-on your system. A Python-only build via `pip install -v --no-cache-dir .` is more likely to work.
+`pip install . --no-build-isolation` may work if you were able to build PyTorch from source
+on your system. A Python-only build via `pip install --no-build-isolation -v --no-cache-dir .` is more likely to work.
 If you installed Pytorch in a Conda environment, make sure to install Apex in that same environment.
 
-
 # Release notes
-# Release notes
+## release/1.10.0
+
+Build and installation related changes:
+- Support JIT (just-in-time) loading of C++ and CUDA extensions
+
 ## release/1.9.0
 
 - No new features were added in this release cycle.
diff --git a/apex/contrib/test/run_rocm_extensions.py b/apex/contrib/test/run_rocm_extensions.py
index c7801988b..1c9add5d8 100644
--- a/apex/contrib/test/run_rocm_extensions.py
+++ b/apex/contrib/test/run_rocm_extensions.py
@@ -2,25 +2,27 @@
 import sys
 
-test_dirs = ["groupbn", "fused_dense", "layer_norm", "multihead_attn", "transducer", "focal_loss", "index_mul_2d", "."] # "." for test_label_smoothing.py
+test_dirs = ["groupbn", "layer_norm", "multihead_attn", "transducer", "focal_loss", "index_mul_2d", ".", \
+    "optimizers", "clip_grad"] # "."
for test_label_smoothing.py ROCM_BLACKLIST = [ "layer_norm" ] -runner = unittest.TextTestRunner(verbosity=2) +if __name__ == '__main__': + runner = unittest.TextTestRunner(verbosity=2) -errcode = 0 + errcode = 0 -for test_dir in test_dirs: - if test_dir in ROCM_BLACKLIST: - continue - suite = unittest.TestLoader().discover(test_dir) + for test_dir in test_dirs: + if test_dir in ROCM_BLACKLIST: + continue + suite = unittest.TestLoader().discover(test_dir) - print("\nExecuting tests from " + test_dir) + print("\nExecuting tests from " + test_dir) - result = runner.run(suite) + result = runner.run(suite) - if not result.wasSuccessful(): - errcode = 1 + if not result.wasSuccessful(): + errcode = 1 -sys.exit(errcode) + sys.exit(errcode) diff --git a/apex/csrc b/apex/csrc new file mode 120000 index 000000000..e96d28eb5 --- /dev/null +++ b/apex/csrc @@ -0,0 +1 @@ +../csrc \ No newline at end of file diff --git a/apex/git_version_info.py b/apex/git_version_info.py new file mode 100644 index 000000000..ee9e7c6c7 --- /dev/null +++ b/apex/git_version_info.py @@ -0,0 +1,34 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# Portions of this code were adapted from DeepSpeed: +# https://github.com/microsoft/DeepSpeed +# Modified for ROCm Apex + +try: + # This is populated by setup.py + from .git_version_info_installed import * # noqa: F401 # type: ignore +except ModuleNotFoundError: + import os + if os.path.isfile('version.txt'): + # Will be missing from checkouts that haven't been installed (e.g., readthedocs) + version = open('version.txt', 'r').read().strip() + else: + version = "0.0.0" + git_hash = '[none]' + git_branch = '[none]' + + from .op_builder.all_ops import ALL_OPS + installed_ops = dict.fromkeys(ALL_OPS.keys(), False) + torch_info = {'version': "0.0", "cuda_version": "0.0", "hip_version": "0.0"} + +# compatible_ops list is recreated for each launch +from .op_builder.all_ops import ALL_OPS + +compatible_ops = dict.fromkeys(ALL_OPS.keys(), False) +for op_name, builder in ALL_OPS.items(): + op_compatible = builder.is_compatible() + compatible_ops[op_name] = op_compatible + compatible_ops["apex_not_implemented"] = False \ No newline at end of file diff --git a/apex/op_builder b/apex/op_builder new file mode 120000 index 000000000..1e19f3e8d --- /dev/null +++ b/apex/op_builder @@ -0,0 +1 @@ +../op_builder \ No newline at end of file diff --git a/compatibility/__init__.py b/compatibility/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/compatibility/_apex_nccl_allocator.py b/compatibility/_apex_nccl_allocator.py new file mode 100644 index 000000000..6a029d1ee --- /dev/null +++ b/compatibility/_apex_nccl_allocator.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _ApexNcclAllocatorModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'NCCLAllocatorBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load _apex_nccl_allocator : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_") and name != "__class__": + raise AttributeError(f"module _apex_nccl_allocator has no attribute '{name}'") + return getattr(self._load_module(), 
name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _ApexNcclAllocatorModule() \ No newline at end of file diff --git a/compatibility/amp_C.py b/compatibility/amp_C.py new file mode 100644 index 000000000..f9257c596 --- /dev/null +++ b/compatibility/amp_C.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _AmpCModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'AmpCBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load amp_C : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module amp_C has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _AmpCModule() \ No newline at end of file diff --git a/compatibility/apex_C.py b/compatibility/apex_C.py new file mode 100644 index 000000000..39bac5264 --- /dev/null +++ b/compatibility/apex_C.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _ApexCModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'ApexCBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load apex_C : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module apex_C has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _ApexCModule() \ No newline at end of file diff --git a/compatibility/bnp.py b/compatibility/bnp.py new file mode 100644 index 000000000..b03ba798c --- /dev/null +++ b/compatibility/bnp.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _BnpModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'BnpBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load bnp : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module bnp has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _BnpModule() \ No newline at end of file diff --git a/compatibility/distributed_adam_cuda.py 
b/compatibility/distributed_adam_cuda.py new file mode 100644 index 000000000..2566dce11 --- /dev/null +++ b/compatibility/distributed_adam_cuda.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _DistributedAdamCudaModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'DistributedAdamBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load distributed_adam_cuda : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module distributed_adam_cuda has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _DistributedAdamCudaModule() \ No newline at end of file diff --git a/compatibility/distributed_lamb_cuda.py b/compatibility/distributed_lamb_cuda.py new file mode 100644 index 000000000..7f0b64f3e --- /dev/null +++ b/compatibility/distributed_lamb_cuda.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _DistributedLambCudaModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'DistributedLambBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load distributed_lamb_cuda : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module distributed_lamb_cuda has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _DistributedLambCudaModule() \ No newline at end of file diff --git a/compatibility/fast_multihead_attn.py b/compatibility/fast_multihead_attn.py new file mode 100644 index 000000000..a9e060b87 --- /dev/null +++ b/compatibility/fast_multihead_attn.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _FastMultiheadAttnModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'FastMultiheadAttnBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load fast_multihead_attn : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module fast_multihead_attn has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _FastMultiheadAttnModule() \ No 
newline at end of file diff --git a/compatibility/focal_loss_cuda.py b/compatibility/focal_loss_cuda.py new file mode 100644 index 000000000..c7b364faf --- /dev/null +++ b/compatibility/focal_loss_cuda.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _FocalLossCudaModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'FocalLossBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load focal_loss_cuda : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module focal_loss_cuda has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _FocalLossCudaModule() \ No newline at end of file diff --git a/compatibility/fused_adam_cuda.py b/compatibility/fused_adam_cuda.py new file mode 100644 index 000000000..bf31ca739 --- /dev/null +++ b/compatibility/fused_adam_cuda.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _FusedAdamCudaModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'FusedAdamBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load fused_adam_cuda : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module fused_adam_cuda has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _FusedAdamCudaModule() \ No newline at end of file diff --git a/compatibility/fused_bias_swiglu.py b/compatibility/fused_bias_swiglu.py new file mode 100644 index 000000000..e9f066f4a --- /dev/null +++ b/compatibility/fused_bias_swiglu.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _FusedBiasSwiGLUModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'FusedBiasSwiGLUBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load fused_bias_swiglu : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module fused_bias_swiglu has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _FusedBiasSwiGLUModule() \ No newline at end of file diff --git 
a/compatibility/fused_dense_cuda.py b/compatibility/fused_dense_cuda.py new file mode 100644 index 000000000..0d28badb2 --- /dev/null +++ b/compatibility/fused_dense_cuda.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _FusedDenseCudaModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'FusedDenseBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load fused_dense_cuda : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module fused_dense_cuda has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _FusedDenseCudaModule() \ No newline at end of file diff --git a/compatibility/fused_index_mul_2d.py b/compatibility/fused_index_mul_2d.py new file mode 100644 index 000000000..c036877df --- /dev/null +++ b/compatibility/fused_index_mul_2d.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _FusedIndexMul2dModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'FusedIndexMul2dBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load fused_index_mul_2d : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module fused_index_mul_2d has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _FusedIndexMul2dModule() \ No newline at end of file diff --git a/compatibility/fused_lamb_cuda.py b/compatibility/fused_lamb_cuda.py new file mode 100644 index 000000000..3ab88d443 --- /dev/null +++ b/compatibility/fused_lamb_cuda.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _FusedLambCudaModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'FusedLambBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load fused_lamb_cuda : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module fused_lamb_cuda has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _FusedLambCudaModule() \ No newline at end of file diff --git 
a/compatibility/fused_layer_norm_cuda.py b/compatibility/fused_layer_norm_cuda.py new file mode 100644 index 000000000..2722e0252 --- /dev/null +++ b/compatibility/fused_layer_norm_cuda.py @@ -0,0 +1,44 @@ +import sys +import importlib + +class _FusedLayerCudaModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + #import the builder + apex_op_builder = importlib.import_module('apex.op_builder') + mlp_builder = getattr(apex_op_builder, 'FusedLayerNormBuilder') + + #load the module + self._loaded_module = mlp_builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load fused_layer_norm_cuda : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module fused_layer_norm_cuda has no attribute '{name}'") + + module = self._load_module() + return getattr(module, name) + + def __dir__(self): + try: + module = self._load_module() + return dir(module) + except: + return [] + + def __repr__(self): + return "" + +#replace module with lazy loader +sys.modules[__name__] = _FusedLayerCudaModule() \ No newline at end of file diff --git a/compatibility/fused_rotary_positional_embedding.py b/compatibility/fused_rotary_positional_embedding.py new file mode 100644 index 000000000..d4f87bd33 --- /dev/null +++ b/compatibility/fused_rotary_positional_embedding.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _FusedRotaryPositionalEmbeddingModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'FusedRopeBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load fused_rotary_positional_embedding : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module fused_rotary_positional_embedding has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _FusedRotaryPositionalEmbeddingModule() \ No newline at end of file diff --git a/compatibility/fused_weight_gradient_mlp_cuda.py b/compatibility/fused_weight_gradient_mlp_cuda.py new file mode 100644 index 000000000..219d9355b --- /dev/null +++ b/compatibility/fused_weight_gradient_mlp_cuda.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _FusedWeightGradientMlpCudaModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'FusedWeightGradientMlpCudaBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load fused_weight_gradient_mlp_cuda : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise 
AttributeError(f"module fused_weight_gradient_mlp_cuda has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _FusedWeightGradientMlpCudaModule() \ No newline at end of file diff --git a/compatibility/generic_scaled_masked_softmax_cuda.py b/compatibility/generic_scaled_masked_softmax_cuda.py new file mode 100644 index 000000000..fa50ca52c --- /dev/null +++ b/compatibility/generic_scaled_masked_softmax_cuda.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _GenericScaledMaskedSoftmaxCudaModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'GenericScaledMaskedSoftmaxCudaBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load generic_scaled_masked_softmax_cuda : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module generic_scaled_masked_softmax_cuda has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _GenericScaledMaskedSoftmaxCudaModule() \ No newline at end of file diff --git a/compatibility/mlp_cuda.py b/compatibility/mlp_cuda.py new file mode 100644 index 000000000..4c873d560 --- /dev/null +++ b/compatibility/mlp_cuda.py @@ -0,0 +1,44 @@ +import sys +import importlib + +class _MLPCudaModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + #import the builder + apex_op_builder = importlib.import_module('apex.op_builder') + mlp_builder = getattr(apex_op_builder, 'MlpBuilder') + + #load the module + self._loaded_module = mlp_builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load mlp_cuda : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module mlp_cuda has no attribute '{name}'") + + module = self._load_module() + return getattr(module, name) + + def __dir__(self): + try: + module = self._load_module() + return dir(module) + except: + return [] + + def __repr__(self): + return "" + +#replace module with lazy loader +sys.modules[__name__] = _MLPCudaModule() \ No newline at end of file diff --git a/compatibility/nccl_p2p_cuda.py b/compatibility/nccl_p2p_cuda.py new file mode 100644 index 000000000..d937cb95e --- /dev/null +++ b/compatibility/nccl_p2p_cuda.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _NcclP2pCudaModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'NCCLP2PBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise 
ImportError(f"Failed to load nccl_p2p_cuda : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module nccl_p2p_cuda has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _NcclP2pCudaModule() \ No newline at end of file diff --git a/compatibility/peer_memory_cuda.py b/compatibility/peer_memory_cuda.py new file mode 100644 index 000000000..d909ec1b9 --- /dev/null +++ b/compatibility/peer_memory_cuda.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _PeerMemoryCudaModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'PeerMemoryBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load peer_memory_cuda : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module peer_memory_cuda has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _PeerMemoryCudaModule() \ No newline at end of file diff --git a/compatibility/scaled_masked_softmax_cuda.py b/compatibility/scaled_masked_softmax_cuda.py new file mode 100644 index 000000000..77ed74e47 --- /dev/null +++ b/compatibility/scaled_masked_softmax_cuda.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _ScaledMaskedSoftmaxCudaModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'ScaledMaskedSoftmaxCudaBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load scaled_masked_softmax_cuda : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module scaled_masked_softmax_cuda has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _ScaledMaskedSoftmaxCudaModule() \ No newline at end of file diff --git a/compatibility/scaled_softmax_cuda.py b/compatibility/scaled_softmax_cuda.py new file mode 100644 index 000000000..d7a4427e3 --- /dev/null +++ b/compatibility/scaled_softmax_cuda.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _ScaledSoftmaxCudaModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'ScaledSoftmaxCudaBuilder') + self._loaded_module = builder().load() + 
except Exception as e: + self._loading = False + raise ImportError(f"Failed to load scaled_softmax_cuda : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module scaled_softmax_cuda has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _ScaledSoftmaxCudaModule() \ No newline at end of file diff --git a/compatibility/scaled_upper_triang_masked_softmax_cuda.py b/compatibility/scaled_upper_triang_masked_softmax_cuda.py new file mode 100644 index 000000000..8da9b5c67 --- /dev/null +++ b/compatibility/scaled_upper_triang_masked_softmax_cuda.py @@ -0,0 +1,38 @@ +import sys +import importlib + +class _ScaledUpperTriangMaskedSoftmaxCudaModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + name = 'ScaledUpperTriangMaskedSoftmaxCudaBuilder' + builder = getattr(apex_op_builder, name) + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load scaled_upper_triang_masked_softmax_cuda : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name_attr): + if name_attr.startswith("_"): + raise AttributeError(f"module scaled_upper_triang_masked_softmax_cuda has no attribute '{name_attr}'") + return getattr(self._load_module(), name_attr) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _ScaledUpperTriangMaskedSoftmaxCudaModule() \ No newline at end of file diff --git a/compatibility/syncbn.py b/compatibility/syncbn.py new file mode 100644 index 000000000..b619575dc --- /dev/null +++ b/compatibility/syncbn.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _SyncbnModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'SyncBnBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load syncbn : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module syncbn has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _SyncbnModule() \ No newline at end of file diff --git a/compatibility/transducer_joint_cuda.py b/compatibility/transducer_joint_cuda.py new file mode 100644 index 000000000..e06705fde --- /dev/null +++ b/compatibility/transducer_joint_cuda.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _TransducerJointCudaModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = 
importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'TransducerJointBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load transducer_joint_cuda : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module transducer_joint_cuda has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _TransducerJointCudaModule() \ No newline at end of file diff --git a/compatibility/transducer_loss_cuda.py b/compatibility/transducer_loss_cuda.py new file mode 100644 index 000000000..d5a2c0f36 --- /dev/null +++ b/compatibility/transducer_loss_cuda.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _TransducerLossCudaModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'TransducerLossBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load transducer_loss_cuda : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module transducer_loss_cuda has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _TransducerLossCudaModule() \ No newline at end of file diff --git a/compatibility/xentropy_cuda.py b/compatibility/xentropy_cuda.py new file mode 100644 index 000000000..ff4dc9733 --- /dev/null +++ b/compatibility/xentropy_cuda.py @@ -0,0 +1,37 @@ +import sys +import importlib + +class _XentropyCudaModule: + def __init__(self): + self._loaded_module = None + self._loading = False + + def _load_module(self): + if self._loaded_module is None and not self._loading: + self._loading = True + try: + apex_op_builder = importlib.import_module('apex.op_builder') + builder = getattr(apex_op_builder, 'XentropyBuilder') + self._loaded_module = builder().load() + except Exception as e: + self._loading = False + raise ImportError(f"Failed to load xentropy_cuda : {e}") + finally: + self._loading = False + return self._loaded_module + + def __getattr__(self, name): + if name.startswith("_"): + raise AttributeError(f"module xentropy_cuda has no attribute '{name}'") + return getattr(self._load_module(), name) + + def __dir__(self): + try: + return dir(self._load_module()) + except: + return [] + + def __repr__(self): + return "" + +sys.modules[__name__] = _XentropyCudaModule() \ No newline at end of file diff --git a/contrib/csrc b/contrib/csrc new file mode 120000 index 000000000..4e941d8b2 --- /dev/null +++ b/contrib/csrc @@ -0,0 +1 @@ +../apex/contrib/csrc \ No newline at end of file diff --git a/op_builder/__init__.py b/op_builder/__init__.py new file mode 100644 index 000000000..726ec6f4d --- /dev/null +++ b/op_builder/__init__.py @@ -0,0 +1,56 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# Portions of this code were adapted from DeepSpeed: +# https://github.com/microsoft/DeepSpeed +# Modified for ROCm Apex + +import sys +import os +import pkgutil +import importlib + +from .builder import get_default_compute_capabilities, OpBuilder + +__apex__ = True + +# List of all available op builders from apex op_builder +try: + import apex.op_builder # noqa: F401 # type: ignore + op_builder_dir = "apex.op_builder" +except ImportError: + op_builder_dir = "op_builder" + +__op_builders__ = [] + +this_module = sys.modules[__name__] + + +def builder_closure(member_name): + if op_builder_dir == "op_builder": + # during installation time cannot get builder due to torch not installed, + # return closure instead + def _builder(): + from apex.op_builder.all_ops import BuilderUtils + builder = BuilderUtils().create_op_builder(member_name) + return builder + + return _builder + else: + # during runtime, return op builder class directly + from apex.op_builder.all_ops import BuilderUtils + builder = BuilderUtils().get_op_builder(member_name) + return builder + +# this is for the import statement such as 'from apex.op_builder import FusedLayerNormBuilder' to work +# reflect builder names and add builder closure, such as 'apex.op_builder.FusedLayerNormBuilder()' creates op builder +for _, module_name, _ in pkgutil.iter_modules([os.path.dirname(this_module.__file__)]): + if module_name != 'all_ops' and module_name != 'builder': + module = importlib.import_module(f".{module_name}", package=op_builder_dir) + for member_name in module.__dir__(): + if member_name.endswith('Builder') and member_name != "OpBuilder" and member_name != "CUDAOpBuilder" and member_name != "CPUOpBuilder": + # assign builder name to variable with same name + # the following is equivalent to i.e. TransformerBuilder = "TransformerBuilder" + this_module.__dict__[member_name] = builder_closure(member_name) \ No newline at end of file diff --git a/op_builder/all_ops.py b/op_builder/all_ops.py new file mode 100644 index 000000000..e18dbdd71 --- /dev/null +++ b/op_builder/all_ops.py @@ -0,0 +1,87 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# Portions of this code were adapted from DeepSpeed: +# https://github.com/microsoft/DeepSpeed +# Modified for ROCm Apex + +import os +import pkgutil +import importlib + +class BuilderUtils: + def op_builder_dir(self): + try: + # is op_builder from apex or a 3p version? this should only succeed if it's apex + # if successful this also means we're doing a local install and not JIT compile path + from op_builder import __apex__ + return "op_builder" + except ImportError: + return "apex.op_builder" + + # dict that holds class name <--> class type mapping i.e. + # 'AsyncIOBuilder': + # this dict will be filled at init stage + class_dict = None + + def _lazy_init_class_dict(self): + if self.class_dict is not None: + return + else: + self.class_dict = {} + # begin initialize for create_op_builder() + # put all valid class name <--> class type mapping into class_dict + op_builder_dir = self.op_builder_dir() + op_builder_module = importlib.import_module(op_builder_dir) + op_builder_absolute_path = os.path.dirname(op_builder_module.__file__) + for _, module_name, _ in pkgutil.iter_modules([op_builder_absolute_path]): + # avoid self references, + # skip sub_directories which contains ops for other backend(cpu, npu, etc.). 
+ if module_name != 'all_ops' and module_name != 'builder' and not os.path.isdir( + os.path.join(op_builder_absolute_path, module_name)): + module = importlib.import_module("{}.{}".format(op_builder_dir, module_name)) + for member_name in module.__dir__(): + if member_name.endswith( + 'Builder' + ) and member_name != "OpBuilder" and member_name != "CUDAOpBuilder" and member_name != "CPUOpBuilder": # avoid abstract classes + if not member_name in self.class_dict: + self.class_dict[member_name] = getattr(module, member_name) + # end initialize for create_op_builder() + + # create an instance of op builder and return, name specified by class_name + def create_op_builder(self, class_name): + self._lazy_init_class_dict() + if class_name in self.class_dict: + return self.class_dict[class_name]() + else: + return None + + # return an op builder class, name specified by class_name + def get_op_builder(self, class_name): + self._lazy_init_class_dict() + if class_name in self.class_dict: + return self.class_dict[class_name] + else: + return None + +# List of all available ops + +# append all builder names into __op_builders__ +builder_utils = BuilderUtils() +op_builder_dir = builder_utils.op_builder_dir() +op_builder_module = importlib.import_module(op_builder_dir) +__op_builders__ = [] + +for _, module_name, _ in pkgutil.iter_modules([os.path.dirname(op_builder_module.__file__)]): + # avoid self references + if module_name != 'all_ops' and module_name != 'builder': + module = importlib.import_module("{}.{}".format(op_builder_dir, module_name)) + for member_name in module.__dir__(): + if member_name.endswith('Builder'): + # append builder to __op_builders__ list + builder = builder_utils.create_op_builder(member_name) + __op_builders__.append(builder) + +ALL_OPS = {op.name: op for op in __op_builders__ if op is not None} \ No newline at end of file diff --git a/op_builder/amp_C.py b/op_builder/amp_C.py new file mode 100644 index 000000000..41f029fcb --- /dev/null +++ b/op_builder/amp_C.py @@ -0,0 +1,45 @@ +from .builder import CUDAOpBuilder + +import sys + + +class AmpCBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_AMP_C' + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" + NAME = "amp_C" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ['csrc/amp_C_frontend.cpp', + 'csrc/multi_tensor_sgd_kernel.cu', + 'csrc/multi_tensor_scale_kernel.cu', + 'csrc/multi_tensor_axpby_kernel.cu', + 'csrc/multi_tensor_l2norm_kernel.cu', + 'csrc/multi_tensor_l2norm_kernel_mp.cu', + 'csrc/multi_tensor_l2norm_scale_kernel.cu', + 'csrc/multi_tensor_lamb_stage_1.cu', + 'csrc/multi_tensor_lamb_stage_2.cu', + 'csrc/multi_tensor_adam.cu', + 'csrc/multi_tensor_adagrad.cu', + 'csrc/multi_tensor_novograd.cu', + 'csrc/multi_tensor_lars.cu', + 'csrc/multi_tensor_lamb.cu', + 'csrc/multi_tensor_lamb_mp.cu'] + + def include_paths(self): + return ['csrc/'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + nvcc_flags = ['-O3'] + self.version_dependent_macros() + if not self.is_rocm_pytorch(): + nvcc_flags += ['-lineinfo', '--use_fast_math'] + return nvcc_flags \ No newline at end of file diff --git a/op_builder/apex_C.py b/op_builder/apex_C.py new file mode 100644 index 000000000..b02526e77 --- /dev/null +++ b/op_builder/apex_C.py @@ -0,0 +1,25 @@ +from .builder import CPUOpBuilder + +import sys + + +class ApexCBuilder(CPUOpBuilder): + BUILD_VAR = 'APEX_BUILD_APEX_C' + 
INCLUDE_FLAG = "APEX_BUILD_CPP_OPS" + NAME = "apex_C" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ["csrc/flatten_unflatten.cpp"] + + def include_paths(self): + return ['csrc/' ] + + def libraries_args(self): + args = super().libraries_args() + return args \ No newline at end of file diff --git a/op_builder/bnp.py b/op_builder/bnp.py new file mode 100644 index 000000000..f7fbe1abd --- /dev/null +++ b/op_builder/bnp.py @@ -0,0 +1,33 @@ +from .builder import CUDAOpBuilder + +import sys + + +class BnpBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_BNP' + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" + NAME = "bnp" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ['contrib/csrc/groupbn/batch_norm.cu', + 'contrib/csrc/groupbn/ipc.cu', + 'contrib/csrc/groupbn/interface.cpp', + 'contrib/csrc/groupbn/batch_norm_add_relu.cu'] + + def include_paths(self): + return ['contrib/csrc', 'csrc'] + + def cxx_args(self): + return self.version_dependent_macros() + + def nvcc_args(self): + return ['-DCUDA_HAS_FP16=1', + '-D__CUDA_NO_HALF_OPERATORS__', + '-D__CUDA_NO_HALF_CONVERSIONS__', + '-D__CUDA_NO_HALF2_OPERATORS__'] + self.version_dependent_macros() \ No newline at end of file diff --git a/op_builder/builder.py b/op_builder/builder.py new file mode 100644 index 000000000..60e490b2b --- /dev/null +++ b/op_builder/builder.py @@ -0,0 +1,927 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# Portions of this code were adapted from DeepSpeed: +# https://github.com/microsoft/DeepSpeed +# Modified for ROCm Apex + +import os +import re +import sys +import time +import importlib +from pathlib import Path +import subprocess +import shlex +import shutil +import tempfile +import distutils.ccompiler +import distutils.log +import distutils.sysconfig +from distutils.errors import CompileError, LinkError +from abc import ABC, abstractmethod +from typing import List + +YELLOW = '\033[93m' +END = '\033[0m' +WARNING = f"{YELLOW} [WARNING] {END}" + +DEFAULT_TORCH_EXTENSION_PATH = "/tmp/torch_extensions" +DEFAULT_COMPUTE_CAPABILITIES = "6.0;6.1;7.0" + +try: + import torch +except ImportError: + print(f"{WARNING} unable to import torch, please install it if you want to pre-compile any apex ops.") +else: + TORCH_MAJOR = int(torch.__version__.split('.')[0]) + TORCH_MINOR = int(torch.__version__.split('.')[1]) + + +class MissingCUDAException(Exception): + pass + + +class CUDAMismatchException(Exception): + pass + + +def installed_cuda_version(name=""): + import torch.utils.cpp_extension + cuda_home = torch.utils.cpp_extension.CUDA_HOME + if cuda_home is None: + raise MissingCUDAException("CUDA_HOME does not exist, unable to compile CUDA op(s)") + # Ensure there is not a cuda version mismatch between torch and nvcc compiler + output = subprocess.check_output([cuda_home + "/bin/nvcc", "-V"], universal_newlines=True) + output_split = output.split() + release_idx = output_split.index("release") + release = output_split[release_idx + 1].replace(',', '').split(".") + # Ignore patch versions, only look at major + minor + cuda_major, cuda_minor = release[:2] + return int(cuda_major), int(cuda_minor) + + +def get_default_compute_capabilities(): + compute_caps = DEFAULT_COMPUTE_CAPABILITIES + # Update compute capability according to: https://en.wikipedia.org/wiki/CUDA#GPUs_supported + import 
torch.utils.cpp_extension + if torch.utils.cpp_extension.CUDA_HOME is not None: + if installed_cuda_version()[0] == 11: + if installed_cuda_version()[1] >= 0: + compute_caps += ";8.0" + if installed_cuda_version()[1] >= 1: + compute_caps += ";8.6" + if installed_cuda_version()[1] >= 8: + compute_caps += ";9.0" + elif installed_cuda_version()[0] == 12: + compute_caps += ";8.0;8.6;9.0" + if installed_cuda_version()[1] >= 8: + compute_caps += ";10.0;12.0" + return compute_caps + + +# list compatible minor CUDA versions - so that for example pytorch built with cuda-11.0 can be used +# to build apex and system-wide installed cuda 11.2 +cuda_minor_mismatch_ok = { + 10: ["10.0", "10.1", "10.2"], + 11: ["11.0", "11.1", "11.2", "11.3", "11.4", "11.5", "11.6", "11.7", "11.8"], + 12: ["12.0", "12.1", "12.2", "12.3", "12.4", "12.5", "12.6", + "12.8"], # There does not appear to be a CUDA Toolkit 12.7 +} + + +def assert_no_cuda_mismatch(name=""): + cuda_major, cuda_minor = installed_cuda_version(name) + sys_cuda_version = f'{cuda_major}.{cuda_minor}' + torch_cuda_version = ".".join(torch.version.cuda.split('.')[:2]) + # This is a show-stopping error, should probably not proceed past this + if sys_cuda_version != torch_cuda_version: + if (cuda_major in cuda_minor_mismatch_ok and sys_cuda_version in cuda_minor_mismatch_ok[cuda_major] + and torch_cuda_version in cuda_minor_mismatch_ok[cuda_major]): + print(f"Installed CUDA version {sys_cuda_version} does not match the " + f"version torch was compiled with {torch.version.cuda} " + "but since the APIs are compatible, accepting this combination") + return True + elif os.getenv("APEX_SKIP_CUDA_CHECK", "0") == "1": + print( + f"{WARNING} Apex Op Builder: Installed CUDA version {sys_cuda_version} does not match the " + f"version torch was compiled with {torch.version.cuda}." + "Detected `APEX_SKIP_CUDA_CHECK=1`: Allowing this combination of CUDA, but it may result in unexpected behavior." + ) + return True + raise CUDAMismatchException( + f">- Apex Op Builder: Installed CUDA version {sys_cuda_version} does not match the " + f"version torch was compiled with {torch.version.cuda}, unable to compile " + "cuda/cpp extensions without a matching cuda version.") + return True + + +class OpBuilder(ABC): + _rocm_version = None + _rocm_gpu_arch = None + _rocm_wavefront_size = None + _is_rocm_pytorch = None + _is_sycl_enabled = None + _loaded_ops = {} + + def __init__(self, name): + self.name = name + self.jit_mode = False + self.build_for_cpu = False + self.enable_bf16 = False + self.error_log = None + + @abstractmethod + def absolute_name(self): + ''' + Returns absolute build path for cases where the op is pre-installed, e.g., apex.ops.adam.cpu_adam + will be installed as something like: apex/ops/adam/cpu_adam.so + ''' + pass + + @abstractmethod + def sources(self): + ''' + Returns list of source files for your op, relative to root of apex package + ''' + pass + + def hipify_extension(self): + pass + + def sycl_extension(self): + pass + + @staticmethod + def validate_torch_version(torch_info): + install_torch_version = torch_info['version'] + current_torch_version = ".".join(torch.__version__.split('.')[:2]) + if install_torch_version != current_torch_version: + raise RuntimeError("PyTorch version mismatch! apex ops were compiled and installed " + "with a different version than what is being used at runtime. " + f"Please re-install apex or switch torch versions. 
" + f"Install torch version={install_torch_version}, " + f"Runtime torch version={current_torch_version}") + + @staticmethod + def validate_torch_op_version(torch_info): + if not OpBuilder.is_rocm_pytorch(): + current_cuda_version = ".".join(torch.version.cuda.split('.')[:2]) + install_cuda_version = torch_info['cuda_version'] + if install_cuda_version != current_cuda_version: + raise RuntimeError("CUDA version mismatch! apex ops were compiled and installed " + "with a different version than what is being used at runtime. " + f"Please re-install apex or switch torch versions. " + f"Install CUDA version={install_cuda_version}, " + f"Runtime CUDA version={current_cuda_version}") + else: + current_hip_version = ".".join(torch.version.hip.split('.')[:2]) + install_hip_version = torch_info['hip_version'] + if install_hip_version != current_hip_version: + raise RuntimeError("HIP version mismatch! apex ops were compiled and installed " + "with a different version than what is being used at runtime. " + f"Please re-install apex or switch torch versions. " + f"Install HIP version={install_hip_version}, " + f"Runtime HIP version={current_hip_version}") + + @staticmethod + def is_rocm_pytorch(): + if OpBuilder._is_rocm_pytorch is not None: + return OpBuilder._is_rocm_pytorch + + _is_rocm_pytorch = False + try: + import torch + except ImportError: + pass + else: + if TORCH_MAJOR > 1 or (TORCH_MAJOR == 1 and TORCH_MINOR >= 5): + _is_rocm_pytorch = hasattr(torch.version, 'hip') and torch.version.hip is not None + if _is_rocm_pytorch: + from torch.utils.cpp_extension import ROCM_HOME + _is_rocm_pytorch = ROCM_HOME is not None + OpBuilder._is_rocm_pytorch = _is_rocm_pytorch + return OpBuilder._is_rocm_pytorch + + @staticmethod + def is_sycl_enabled(): + if OpBuilder._is_sycl_enabled is not None: + return OpBuilder._is_sycl_enabled + + _is_sycl_enabled = False + try: + result = subprocess.run(["c2s", "--version"], capture_output=True) + except: + pass + else: + _is_sycl_enabled = True + + OpBuilder._is_sycl_enabled = _is_sycl_enabled + return OpBuilder._is_sycl_enabled + + @staticmethod + def installed_rocm_version(): + if OpBuilder._rocm_version: + return OpBuilder._rocm_version + + ROCM_MAJOR = '0' + ROCM_MINOR = '0' + ROCM_VERSION_DEV_RAW = "" + if OpBuilder.is_rocm_pytorch(): + from torch.utils.cpp_extension import ROCM_HOME + rocm_ver_file = Path(ROCM_HOME).joinpath(".info/version") + if rocm_ver_file.is_file(): + with open(rocm_ver_file, 'r') as file: + ROCM_VERSION_DEV_RAW = file.read() + elif "rocm" in torch.__version__: + ROCM_VERSION_DEV_RAW = torch.__version__.split("rocm")[1] + if ROCM_VERSION_DEV_RAW != "": + ROCM_MAJOR = ROCM_VERSION_DEV_RAW.split('.')[0] + ROCM_MINOR = ROCM_VERSION_DEV_RAW.split('.')[1] + else: + # Look in /usr/include/rocm-version.h + rocm_ver_file = Path("/usr/include/rocm_version.h") + if rocm_ver_file.is_file(): + with open(rocm_ver_file, 'r') as file: + for ln in file.readlines(): + if "#define ROCM_VERSION_MAJOR" in ln: + ROCM_MAJOR = re.findall(r'\S+', ln)[2] + elif "#define ROCM_VERSION_MINOR" in ln: + ROCM_MINOR = re.findall(r'\S+', ln)[2] + if ROCM_MAJOR == '0': + assert False, "Could not detect ROCm version" + + OpBuilder._rocm_version = (int(ROCM_MAJOR), int(ROCM_MINOR)) + return OpBuilder._rocm_version + + @staticmethod + def get_rocm_gpu_arch(): + if OpBuilder._rocm_gpu_arch: + return OpBuilder._rocm_gpu_arch + rocm_info = Path("/opt/rocm/bin/rocminfo") + if (not rocm_info.is_file()): + rocm_info = Path("rocminfo") + rocm_gpu_arch_cmd = str(rocm_info) + " | 
grep -o -m 1 'gfx.*'" + try: + result = subprocess.check_output(rocm_gpu_arch_cmd, shell=True) + rocm_gpu_arch = result.decode('utf-8').strip() + except subprocess.CalledProcessError: + rocm_gpu_arch = "" + OpBuilder._rocm_gpu_arch = rocm_gpu_arch + return OpBuilder._rocm_gpu_arch + + @staticmethod + def get_rocm_wavefront_size(): + if OpBuilder._rocm_wavefront_size: + return OpBuilder._rocm_wavefront_size + + rocm_info = Path("/opt/rocm/bin/rocminfo") + if (not rocm_info.is_file()): + rocm_info = Path("rocminfo") + rocm_wavefront_size_cmd = str( + rocm_info) + " | grep -Eo -m1 'Wavefront Size:[[:space:]]+[0-9]+' | grep -Eo '[0-9]+'" + try: + result = subprocess.check_output(rocm_wavefront_size_cmd, shell=True) + rocm_wavefront_size = result.decode('utf-8').strip() + except subprocess.CalledProcessError: + rocm_wavefront_size = "32" + OpBuilder._rocm_wavefront_size = rocm_wavefront_size + return OpBuilder._rocm_wavefront_size + + def include_paths(self): + ''' + Returns list of include paths, relative to root of apex package + ''' + return [] + + def nvcc_args(self): + ''' + Returns optional list of compiler flags to forward to nvcc when building CUDA sources + ''' + return [] + + def cxx_args(self): + ''' + Returns optional list of compiler flags to forward to the build + ''' + return [] + + def is_compatible(self, verbose=False): + ''' + Check if all non-python dependencies are satisfied to build this op + ''' + return True + + def extra_ldflags(self): + return [] + + def has_function(self, funcname, libraries, library_dirs=None, verbose=False): + ''' + Test for existence of a function within a tuple of libraries. + + This is used as a smoke test to check whether a certain library is available. + As a test, this creates a simple C program that calls the specified function, + and then distutils is used to compile that program and link it with the specified libraries. + Returns True if both the compile and link are successful, False otherwise. + ''' + tempdir = None # we create a temporary directory to hold various files + filestderr = None # handle to open file to which we redirect stderr + oldstderr = None # file descriptor for stderr + try: + # Echo compile and link commands that are used. + if verbose: + distutils.log.set_verbosity(1) + + # Create a compiler object. + compiler = distutils.ccompiler.new_compiler(verbose=verbose) + + # Configure compiler and linker to build according to Python install. + distutils.sysconfig.customize_compiler(compiler) + + # Create a temporary directory to hold test files. + tempdir = tempfile.mkdtemp() + + # Define a simple C program that calls the function in question + prog = "void %s(void); int main(int argc, char** argv) { %s(); return 0; }" % (funcname, funcname) + + # Write the test program to a file. + filename = os.path.join(tempdir, 'test.c') + with open(filename, 'w') as f: + f.write(prog) + + # Redirect stderr file descriptor to a file to silence compile/link warnings. + if not verbose: + filestderr = open(os.path.join(tempdir, 'stderr.txt'), 'w') + oldstderr = os.dup(sys.stderr.fileno()) + os.dup2(filestderr.fileno(), sys.stderr.fileno()) + + # Workaround for behavior in distutils.ccompiler.CCompiler.object_filenames() + # Otherwise, a local directory will be used instead of tempdir + drive, driveless_filename = os.path.splitdrive(filename) + root_dir = driveless_filename[0] if os.path.isabs(driveless_filename) else '' + output_dir = os.path.join(drive, root_dir) + + # Attempt to compile the C program into an object file. 
+ cflags = shlex.split(os.environ.get('CFLAGS', "")) + objs = compiler.compile([filename], output_dir=output_dir, extra_preargs=self.strip_empty_entries(cflags)) + + # Attempt to link the object file into an executable. + # Be sure to tack on any libraries that have been specified. + ldflags = shlex.split(os.environ.get('LDFLAGS', "")) + compiler.link_executable(objs, + os.path.join(tempdir, 'a.out'), + extra_preargs=self.strip_empty_entries(ldflags), + libraries=libraries, + library_dirs=library_dirs) + + # Compile and link succeeded + return True + + except CompileError: + return False + + except LinkError: + return False + + except: + return False + + finally: + # Restore stderr file descriptor and close the stderr redirect file. + if oldstderr is not None: + os.dup2(oldstderr, sys.stderr.fileno()) + if filestderr is not None: + filestderr.close() + + # Delete the temporary directory holding the test program and stderr files. + if tempdir is not None: + shutil.rmtree(tempdir) + + def strip_empty_entries(self, args): + ''' + Drop any empty strings from the list of compile and link flags + ''' + return [x for x in args if len(x) > 0] + + def cpu_arch(self): + try: + from cpuinfo import get_cpu_info + except ImportError as e: + cpu_info = self._backup_cpuinfo() + if cpu_info is None: + return "-march=native" + + try: + cpu_info = get_cpu_info() + except Exception as e: + self.warning(f"{self.name} attempted to use py-cpuinfo but failed (exception type: {type(e)}, {e}), " + "falling back to lscpu to get this information.") + cpu_info = self._backup_cpuinfo() + if cpu_info is None: + return "-march=native" + + if cpu_info['arch'].startswith('PPC_'): + # gcc does not provide -march on PowerPC, use -mcpu instead + return '-mcpu=native' + return '-march=native' + + def get_cuda_compile_flag(self): + try: + if not self.is_rocm_pytorch(): + assert_no_cuda_mismatch(self.name) + return "-D__ENABLE_CUDA__" + except MissingCUDAException: + print(f"{WARNING} {self.name} cuda is missing or is incompatible with installed torch, " + "only cpu ops can be compiled!") + return '-D__DISABLE_CUDA__' + return '-D__DISABLE_CUDA__' + + def _backup_cpuinfo(self): + # Construct cpu_info dict from lscpu that is similar to what py-cpuinfo provides + if not self.command_exists('lscpu'): + self.warning(f"{self.name} attempted to query 'lscpu' after failing to use py-cpuinfo " + "to detect the CPU architecture. 
'lscpu' does not appear to exist on " + "your system, will fall back to use -march=native and non-vectorized execution.") + return None + result = subprocess.check_output(['lscpu']) + result = result.decode('utf-8').strip().lower() + + cpu_info = {} + cpu_info['arch'] = None + cpu_info['flags'] = "" + if 'genuineintel' in result or 'authenticamd' in result: + cpu_info['arch'] = 'X86_64' + if 'avx512' in result: + cpu_info['flags'] += 'avx512,' + elif 'avx512f' in result: + cpu_info['flags'] += 'avx512f,' + if 'avx2' in result: + cpu_info['flags'] += 'avx2' + elif 'ppc64le' in result: + cpu_info['arch'] = "PPC_" + + return cpu_info + + def simd_width(self): + try: + from cpuinfo import get_cpu_info + except ImportError as e: + cpu_info = self._backup_cpuinfo() + if cpu_info is None: + return '-D__SCALAR__' + + try: + cpu_info = get_cpu_info() + except Exception as e: + self.warning(f"{self.name} attempted to use py-cpuinfo but failed (exception type: {type(e)}, {e}), " + "falling back to lscpu to get this information.") + cpu_info = self._backup_cpuinfo() + if cpu_info is None: + return '-D__SCALAR__' + + if cpu_info['arch'] == 'X86_64': + if 'avx512' in cpu_info['flags'] or 'avx512f' in cpu_info['flags']: + return '-D__AVX512__' + elif 'avx2' in cpu_info['flags']: + return '-D__AVX256__' + return '-D__SCALAR__' + + def command_exists(self, cmd): + if '|' in cmd: + cmds = cmd.split("|") + else: + cmds = [cmd] + valid = False + for cmd in cmds: + safe_cmd = ["bash", "-c", f"type {cmd}"] + result = subprocess.Popen(safe_cmd, stdout=subprocess.PIPE) + valid = valid or result.wait() == 0 + + if not valid and len(cmds) > 1: + print(f"{WARNING} {self.name} requires one of the following commands '{cmds}', but it does not exist!") + elif not valid and len(cmds) == 1: + print(f"{WARNING} {self.name} requires the '{cmd}' command, but it does not exist!") + return valid + + def warning(self, msg): + self.error_log = f"{msg}" + print(f"{WARNING} {msg}") + + def apex_src_path(self, code_path): + if os.path.isabs(code_path): + return code_path + else: + return os.path.join(Path(__file__).parent.parent.absolute(), code_path) + + def builder(self): + from torch.utils.cpp_extension import CppExtension + include_dirs = [os.path.abspath(x) for x in self.strip_empty_entries(self.include_paths())] + return CppExtension(name=self.absolute_name(), + sources=self.strip_empty_entries(self.sources()), + include_dirs=include_dirs, + extra_compile_args={'cxx': self.strip_empty_entries(self.cxx_args())}, + extra_link_args=self.strip_empty_entries(self.extra_ldflags())) + + def load(self, verbose=True): + if self.name in __class__._loaded_ops: + return __class__._loaded_ops[self.name] + + from apex.git_version_info import installed_ops, torch_info + if installed_ops.get(self.name, False): + # Ensure the op we're about to load was compiled with the same + # torch/cuda versions we are currently using at runtime. + self.validate_torch_version(torch_info) + if torch.cuda.is_available() and isinstance(self, CUDAOpBuilder): + self.validate_torch_op_version(torch_info) + + op_module = importlib.import_module(self.absolute_name()) + __class__._loaded_ops[self.name] = op_module + return op_module + else: + return self.jit_load(verbose) + + def jit_load(self, verbose=True): + if not self.is_compatible(verbose): + raise RuntimeError( + f"Unable to JIT load the {self.name} op due to it not being compatible due to hardware/software issue. 
{self.error_log}" + ) + try: + import ninja # noqa: F401 # type: ignore + except ImportError: + raise RuntimeError(f"Unable to JIT load the {self.name} op due to ninja not being installed.") + + if isinstance(self, CUDAOpBuilder) and not self.is_rocm_pytorch(): + self.build_for_cpu = not torch.cuda.is_available() + + self.jit_mode = True + from torch.utils.cpp_extension import load + + start_build = time.time() + sources = [os.path.abspath(self.apex_src_path(path)) for path in self.sources()] + extra_include_paths = [os.path.abspath(self.apex_src_path(path)) for path in self.include_paths()] + + # Torch will try and apply whatever CCs are in the arch list at compile time, + # we have already set the intended targets ourselves we know that will be + # needed at runtime. This prevents CC collisions such as multiple __half + # implementations. Stash arch list to reset after build. + torch_arch_list = None + if "TORCH_CUDA_ARCH_LIST" in os.environ: + torch_arch_list = os.environ.get("TORCH_CUDA_ARCH_LIST") + os.environ["TORCH_CUDA_ARCH_LIST"] = "" + + nvcc_args = self.strip_empty_entries(self.nvcc_args()) + cxx_args = self.strip_empty_entries(self.cxx_args()) + + cxx_args.append("-UC10_USE_GLOG") + nvcc_args.append("-UC10_USE_GLOG") + if isinstance(self, CUDAOpBuilder): + if not self.build_for_cpu and self.enable_bf16: + cxx_args.append("-DBF16_AVAILABLE") + nvcc_args.append("-DBF16_AVAILABLE") + nvcc_args.append("-U__CUDA_NO_BFLOAT16_OPERATORS__") + nvcc_args.append("-U__CUDA_NO_BFLOAT162_OPERATORS__") + nvcc_args.append("-U__CUDA_NO_BFLOAT16_CONVERSIONS__") + + if self.is_rocm_pytorch(): + cxx_args.append("-D__HIP_PLATFORM_AMD__=1") + os.environ["PYTORCH_ROCM_ARCH"] = self.get_rocm_gpu_arch() + cxx_args.append('-DROCM_WAVEFRONT_SIZE=%s' % self.get_rocm_wavefront_size()) + + op_module = load(name=self.name, + sources=self.strip_empty_entries(sources), + extra_include_paths=self.strip_empty_entries(extra_include_paths), + extra_cflags=cxx_args, + extra_cuda_cflags=nvcc_args, + extra_ldflags=self.strip_empty_entries(self.extra_ldflags()), + with_cuda=True if (isinstance(self, CUDAOpBuilder) and not self.build_for_cpu) else None, + verbose=verbose) + + build_duration = time.time() - start_build + if verbose: + print(f"Time to load {self.name} op: {build_duration} seconds") + + # Reset arch list so we are not silently removing it for other possible use cases + if torch_arch_list: + os.environ["TORCH_CUDA_ARCH_LIST"] = torch_arch_list + + __class__._loaded_ops[self.name] = op_module + + return op_module + + +class CUDAOpBuilder(OpBuilder): + + def compute_capability_args(self, cross_compile_archs=None): + """ + Returns nvcc compute capability compile flags. + + 1. `TORCH_CUDA_ARCH_LIST` takes priority over `cross_compile_archs`. + 2. If neither is set default compute capabilities will be used + 3. Under `jit_mode` compute capabilities of all visible cards will be used plus PTX + + Format: + + - `TORCH_CUDA_ARCH_LIST` may use ; or whitespace separators. Examples: + + TORCH_CUDA_ARCH_LIST="6.1;7.5;8.6;9.0;10.0" pip install ... + TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0 7.5 8.0 8.6 9.0 10.0+PTX" pip install ... + + - `cross_compile_archs` uses ; separator. 
+ + """ + ccs = [] + if self.jit_mode: + # Compile for underlying architectures since we know those at runtime + for i in range(torch.cuda.device_count()): + CC_MAJOR, CC_MINOR = torch.cuda.get_device_capability(i) + cc = f"{CC_MAJOR}.{CC_MINOR}" + if cc not in ccs: + ccs.append(cc) + ccs = sorted(ccs) + ccs[-1] += '+PTX' + else: + # Cross-compile mode, compile for various architectures + # env override takes priority + cross_compile_archs_env = os.environ.get('TORCH_CUDA_ARCH_LIST', None) + if cross_compile_archs_env is not None: + if cross_compile_archs is not None: + print( + f"{WARNING} env var TORCH_CUDA_ARCH_LIST={cross_compile_archs_env} overrides cross_compile_archs={cross_compile_archs}" + ) + cross_compile_archs = cross_compile_archs_env.replace(' ', ';') + else: + if cross_compile_archs is None: + cross_compile_archs = get_default_compute_capabilities() + ccs = cross_compile_archs.split(';') + + ccs = self.filter_ccs(ccs) + if len(ccs) == 0: + raise RuntimeError( + f"Unable to load {self.name} op due to no compute capabilities remaining after filtering") + + args = [] + self.enable_bf16 = True + for cc in ccs: + num = cc[0] + cc[1].split('+')[0] + args.append(f'-gencode=arch=compute_{num},code=sm_{num}') + if cc[1].endswith('+PTX'): + args.append(f'-gencode=arch=compute_{num},code=compute_{num}') + + if int(cc[0]) <= 7: + self.enable_bf16 = False + + return args + + def filter_ccs(self, ccs: List[str]): + """ + Prune any compute capabilities that are not compatible with the builder. Should log + which CCs have been pruned. + """ + return [cc.split('.') for cc in ccs] + + def version_dependent_macros(self): + # Fix from apex that might be relevant for us as well, related to https://github.com/NVIDIA/apex/issues/456 + version_ge_1_1 = [] + if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 0): + version_ge_1_1 = ['-DVERSION_GE_1_1'] + version_ge_1_3 = [] + if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 2): + version_ge_1_3 = ['-DVERSION_GE_1_3'] + version_ge_1_5 = [] + if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 4): + version_ge_1_5 = ['-DVERSION_GE_1_5'] + + version_dependent_macro_args = version_ge_1_1 + version_ge_1_3 + version_ge_1_5 + if self.is_rocm_pytorch() and (self.torch_version()[0] >= 6): + version_dependent_macro_args += ["-DHIPBLAS_V2"] + + return version_dependent_macro_args + + def is_compatible(self, verbose=False): + return super().is_compatible(verbose) + + def builder(self): + try: + if not self.is_rocm_pytorch(): + assert_no_cuda_mismatch(self.name) + self.build_for_cpu = False + except MissingCUDAException: + self.build_for_cpu = True + + if self.build_for_cpu: + from torch.utils.cpp_extension import CppExtension as ExtensionBuilder + else: + from torch.utils.cpp_extension import CUDAExtension as ExtensionBuilder + include_dirs = [os.path.abspath(x) for x in self.strip_empty_entries(self.include_paths())] + compile_args = {'cxx': self.strip_empty_entries(self.cxx_args())} if self.build_for_cpu else \ + {'cxx': self.strip_empty_entries(self.cxx_args()), \ + 'nvcc': self.strip_empty_entries(self.nvcc_args())} + + if not self.build_for_cpu and self.enable_bf16: + compile_args['cxx'].append("-DBF16_AVAILABLE") + compile_args['nvcc'].append("-DBF16_AVAILABLE") + + if self.is_rocm_pytorch(): + compile_args['cxx'].append("-D__HIP_PLATFORM_AMD__=1") + #cxx compiler args are required to compile cpp files + compile_args['cxx'].append('-DROCM_WAVEFRONT_SIZE=%s' % self.get_rocm_wavefront_size()) + #nvcc compiler args are required to 
compile hip files + compile_args['nvcc'].append('-DROCM_WAVEFRONT_SIZE=%s' % self.get_rocm_wavefront_size()) + if self.get_rocm_gpu_arch(): + os.environ["PYTORCH_ROCM_ARCH"] = self.get_rocm_gpu_arch() + + cuda_ext = ExtensionBuilder(name=self.absolute_name(), + sources=self.strip_empty_entries(self.sources()), + include_dirs=include_dirs, + libraries=self.strip_empty_entries(self.libraries_args()), + extra_compile_args=compile_args, + extra_link_args=self.strip_empty_entries(self.extra_ldflags())) + + if self.is_rocm_pytorch(): + # hip converts paths to absolute, this converts back to relative + sources = cuda_ext.sources + curr_file = Path(__file__).parent.parent # ds root + for i in range(len(sources)): + src = Path(sources[i]) + if src.is_absolute(): + sources[i] = str(src.relative_to(curr_file)) + else: + sources[i] = str(src) + cuda_ext.sources = sources + return cuda_ext + + def hipify_extension(self): + if self.is_rocm_pytorch(): + from torch.utils.hipify import hipify_python + hipify_python.hipify( + project_directory=os.getcwd(), + output_directory=os.getcwd(), + header_include_dirs=self.include_paths(), + includes=[os.path.join(os.getcwd(), '*')], + extra_files=[os.path.abspath(s) for s in self.sources()], + show_detailed=True, + is_pytorch_extension=True, + hipify_extra_files_only=True, + ) + + def cxx_args(self): + if sys.platform == "win32": + return ['-O2'] + else: + return ['-O3', '-std=c++17', '-g', '-Wno-reorder'] + + def nvcc_args(self): + if self.build_for_cpu: + return [] + args = ['-O3'] + if self.is_rocm_pytorch(): + ROCM_MAJOR, ROCM_MINOR = self.installed_rocm_version() + args += [ + '-std=c++17', '-U__HIP_NO_HALF_OPERATORS__', '-U__HIP_NO_HALF_CONVERSIONS__', + '-U__HIP_NO_HALF2_OPERATORS__', + '-DROCM_VERSION_MAJOR=%s' % ROCM_MAJOR, + '-DROCM_VERSION_MINOR=%s' % ROCM_MINOR + ] + else: + try: + nvcc_threads = int(os.getenv("APEX_NVCC_THREADS", "")) + if nvcc_threads <= 0: + raise ValueError("") + except ValueError: + nvcc_threads = min(os.cpu_count(), 8) + + cuda_major, cuda_minor = installed_cuda_version() + if cuda_major > 10: + if cuda_major == 12 and cuda_minor >= 5: + std_lib = '-std=c++20' + else: + std_lib = '-std=c++17' + else: + std_lib = '-std=c++14' + args += [ + '-allow-unsupported-compiler' if sys.platform == "win32" else '', '--use_fast_math', std_lib, + '-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', '-U__CUDA_NO_HALF2_OPERATORS__', + f'--threads={nvcc_threads}' + ] + if os.environ.get('APEX_DEBUG_CUDA_BUILD', '0') == '1': + args.append('--ptxas-options=-v') + args += self.compute_capability_args() + return args + + def libraries_args(self): + if self.build_for_cpu: + return [] + + if sys.platform == "win32": + return ['cublas', 'curand'] + else: + return [] + + def backward_pass_guard_args(self): + torch_dir = torch.__path__[0] + context_file = os.path.join(torch_dir, "include", "ATen", "Context.h") + if os.path.exists(context_file): + lines = open(context_file, 'r').readlines() + found_Backward_Pass_Guard = False + found_ROCmBackward_Pass_Guard = False + for line in lines: + if "BackwardPassGuard" in line: + # BackwardPassGuard has been renamed to ROCmBackwardPassGuard + # https://github.com/pytorch/pytorch/pull/71881/commits/4b82f5a67a35406ffb5691c69e6b4c9086316a43 + if "ROCmBackwardPassGuard" in line: + found_ROCmBackward_Pass_Guard = True + else: + found_Backward_Pass_Guard = True + break + backward_pass_guard_args = [] + if found_Backward_Pass_Guard: + backward_pass_guard_args += ['-DBACKWARD_PASS_GUARD'] + 
['-DBACKWARD_PASS_GUARD_CLASS=BackwardPassGuard'] + if found_ROCmBackward_Pass_Guard: + backward_pass_guard_args += ['-DBACKWARD_PASS_GUARD'] + ['-DBACKWARD_PASS_GUARD_CLASS=ROCmBackwardPassGuard'] + return backward_pass_guard_args + + def aten_atomic_args(self): + torch_dir = torch.__path__[0] + if os.path.exists(os.path.join(torch_dir, "include", "ATen", "Atomic.cuh")): + return ['-DATEN_ATOMIC_HEADER'] + else: + return [] + + def generator_args(self): + generator_flag = [] + torch_dir = torch.__path__[0] + if os.path.exists(os.path.join(torch_dir, "include", "ATen", "CUDAGeneratorImpl.h")): + generator_flag = ["-DOLD_GENERATOR_PATH"] + return generator_flag + + def nvcc_threads_args(self): + cuda_major, cuda_minor = installed_cuda_version() + if cuda_major >= 11 and cuda_minor >= 2: + return ["--threads", "4"] + return [] + + def nccl_args(self): + nccl_library = ["-lnccl"] + if self.is_rocm_pytorch(): + nccl_library = ["-lrccl"] + return nccl_library + + def nccl_version(self): + return torch.cuda.nccl.version()[0:2] + + def torch_version(self): + return (TORCH_MAJOR, TORCH_MINOR) + + def is_supported(self): + return super().is_supported() + +class CPUOpBuilder(CUDAOpBuilder): + + def get_cuda_lib64_path(self): + import torch + if not self.is_rocm_pytorch(): + CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "lib64") + if not os.path.exists(CUDA_LIB64): + CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "lib") + else: + CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.ROCM_HOME, "lib") + return CUDA_LIB64 + + def extra_ldflags(self): + if self.build_for_cpu: + return ['-fopenmp'] + + if not self.is_rocm_pytorch(): + ld_flags = ['-lcurand'] + if not self.build_for_cpu: + ld_flags.append(f'-L{self.get_cuda_lib64_path()}') + return ld_flags + + return [] + + def cxx_args(self): + args = [] + if not self.build_for_cpu: + CUDA_LIB64 = self.get_cuda_lib64_path() + + args += super().cxx_args() + args += [ + f'-L{CUDA_LIB64}', + '-lcudart', + '-lcublas', + '-g', + ] + + CPU_ARCH = self.cpu_arch() + SIMD_WIDTH = self.simd_width() + CUDA_ENABLE = self.get_cuda_compile_flag() + args += [ + CPU_ARCH, + '-fopenmp', + SIMD_WIDTH, + CUDA_ENABLE, + ] + + return args \ No newline at end of file diff --git a/op_builder/distributed_adam.py b/op_builder/distributed_adam.py new file mode 100644 index 000000000..ef453bee9 --- /dev/null +++ b/op_builder/distributed_adam.py @@ -0,0 +1,33 @@ +from .builder import CUDAOpBuilder + +import sys + + +class DistributedAdamBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_DISTRIBUTED_ADAM' + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" + NAME = "distributed_adam_cuda" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ['contrib/csrc/optimizers/multi_tensor_distopt_adam.cpp', + 'contrib/csrc/optimizers/multi_tensor_distopt_adam_kernel.cu'] + + def include_paths(self): + return ['contrib/csrc/', + 'csrc'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + nvcc_flags = ['-O3'] + self.version_dependent_macros() + if not self.is_rocm_pytorch(): + nvcc_flags += ['--use_fast_math'] + return nvcc_flags \ No newline at end of file diff --git a/op_builder/distributed_lamb.py b/op_builder/distributed_lamb.py new file mode 100644 index 000000000..74d77d129 --- /dev/null +++ b/op_builder/distributed_lamb.py @@ -0,0 +1,33 @@ +from .builder import CUDAOpBuilder + +import sys + + +class 
DistributedLambBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_DISTRIBUTED_LAMB' + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" + NAME = "distributed_lamb_cuda" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ['contrib/csrc/optimizers/multi_tensor_distopt_lamb.cpp', + 'contrib/csrc/optimizers/multi_tensor_distopt_lamb_kernel.cu'] + + def include_paths(self): + return ['contrib/csrc/', + 'csrc'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + nvcc_flags = ['-O3'] + self.version_dependent_macros() + if not self.is_rocm_pytorch(): + nvcc_flags += ['--use_fast_math'] + return nvcc_flags \ No newline at end of file diff --git a/op_builder/fast_multihead_attn.py b/op_builder/fast_multihead_attn.py new file mode 100644 index 000000000..0f2f8b52f --- /dev/null +++ b/op_builder/fast_multihead_attn.py @@ -0,0 +1,50 @@ +from .builder import CUDAOpBuilder + +import sys + + +class FastMultiheadAttnBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_FAST_MULTIHEAD_ATTN' + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" + NAME = "fast_multihead_attn" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ['contrib/csrc/multihead_attn/multihead_attn_frontend.cpp', + 'contrib/csrc/multihead_attn/additive_masked_softmax_dropout_cuda.cu', + "contrib/csrc/multihead_attn/masked_softmax_dropout_cuda.cu", + "contrib/csrc/multihead_attn/encdec_multihead_attn_cuda.cu", + "contrib/csrc/multihead_attn/encdec_multihead_attn_norm_add_cuda.cu", + "contrib/csrc/multihead_attn/self_multihead_attn_cuda.cu", + "contrib/csrc/multihead_attn/self_multihead_attn_bias_additive_mask_cuda.cu", + "contrib/csrc/multihead_attn/self_multihead_attn_bias_cuda.cu", + "contrib/csrc/multihead_attn/self_multihead_attn_norm_add_cuda.cu"] + + def include_paths(self): + return ['csrc/', + 'contrib/csrc/', + 'contrib/csrc/multihead_attn'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + self.generator_args() + + def nvcc_args(self): + nvcc_flags = ['-O3'] + self.version_dependent_macros() + self.generator_args() + if not self.is_rocm_pytorch(): + nvcc_flags += ['-U__CUDA_NO_HALF_OPERATORS__', + '-U__CUDA_NO_HALF_CONVERSIONS__', + '--expt-relaxed-constexpr', + '--expt-extended-lambda', + '--use_fast_math'] + self.compute_capability_args() + else: + nvcc_flags += ['-I/opt/rocm/include/hiprand', + '-I/opt/rocm/include/rocrand', + '-U__HIP_NO_HALF_OPERATORS__', + '-U__HIP_NO_HALF_CONVERSIONS__'] + self.backward_pass_guard_args() + return nvcc_flags \ No newline at end of file diff --git a/op_builder/focal_loss.py b/op_builder/focal_loss.py new file mode 100644 index 000000000..98a21330a --- /dev/null +++ b/op_builder/focal_loss.py @@ -0,0 +1,33 @@ +from .builder import CUDAOpBuilder + +import sys + + +class FocalLossBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_FOCAL_LOSS' + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" + NAME = "focal_loss_cuda" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ['contrib/csrc/focal_loss/focal_loss_cuda.cpp', + 'contrib/csrc/focal_loss/focal_loss_cuda_kernel.cu'] + + def include_paths(self): + return ['contrib/csrc/' ] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + 
+ def nvcc_args(self): + if self.is_rocm_pytorch(): + nvcc_flags = ['-O3'] + self.version_dependent_macros() + else: + nvcc_flags = ['-O3', '--ftz=false', '--use_fast_math'] + return nvcc_flags \ No newline at end of file diff --git a/op_builder/fused_adam.py b/op_builder/fused_adam.py new file mode 100644 index 000000000..f335368d8 --- /dev/null +++ b/op_builder/fused_adam.py @@ -0,0 +1,33 @@ +from .builder import CUDAOpBuilder + +import sys + + +class FusedAdamBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_FUSED_ADAM' + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" + NAME = "fused_adam_cuda" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ['contrib/csrc/optimizers/fused_adam_cuda.cpp', + 'contrib/csrc/optimizers/fused_adam_cuda_kernel.cu'] + + def include_paths(self): + return ['contrib/csrc/', + 'csrc'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + nvcc_flags = ['-O3'] + self.version_dependent_macros() + if not self.is_rocm_pytorch(): + nvcc_flags += ['--use_fast_math'] + return nvcc_flags \ No newline at end of file diff --git a/op_builder/fused_bias_swiglu.py b/op_builder/fused_bias_swiglu.py new file mode 100644 index 000000000..4a7d13881 --- /dev/null +++ b/op_builder/fused_bias_swiglu.py @@ -0,0 +1,57 @@ +from .builder import CUDAOpBuilder +import sys +import os + +class FusedBiasSwiGLUBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_FUSED_BIAS_SWIGLU' + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" + NAME = "fused_bias_swiglu" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return [ + "csrc/megatron/fused_bias_swiglu.cpp", + "csrc/megatron/fused_bias_swiglu_cuda.cu" + ] + + def include_paths(self): + return ['csrc'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + nvcc_flags = [ + '-O3', + '-U__CUDA_NO_HALF_OPERATORS__', + '-U__CUDA_NO_HALF_CONVERSIONS__' + ] + self.version_dependent_macros() + if not self.is_rocm_pytorch(): + nvcc_flags.extend( + [ + '--expt-relaxed-constexpr', + '--expt-extended-lambda' + ]) + else: + # Handle ROCm arch flags + amdgpu_targets = os.environ.get('PYTORCH_ROCM_ARCH', '') + if not amdgpu_targets: + print("Warning: PYTORCH_ROCM_ARCH environment variable is empty.") + print("Using default architecture. 
Set this variable for specific GPU targets.") + print("Example: export PYTORCH_ROCM_ARCH=gfx906") + amdgpu_targets = "gfx906" + try: + for amdgpu_target in amdgpu_targets.split(';'): + if amdgpu_target: + nvcc_flags += [f'--offload-arch={amdgpu_target}'] + except Exception as e: + print(f"Warning: Error processing PYTORCH_ROCM_ARCH: {e}") + print("Falling back to default architecture gfx906") + nvcc_flags += ['--offload-arch=gfx906'] + return nvcc_flags \ No newline at end of file diff --git a/op_builder/fused_dense.py b/op_builder/fused_dense.py new file mode 100644 index 000000000..4d40eef6d --- /dev/null +++ b/op_builder/fused_dense.py @@ -0,0 +1,28 @@ +from .builder import CUDAOpBuilder + +import sys + + +class FusedDenseBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_FUSED_DENSE' + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" + NAME = "fused_dense_cuda" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ['csrc/fused_dense_base.cpp', 'csrc/fused_dense_cuda.cu'] + + def include_paths(self): + return ['csrc'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + return ['-O3'] + self.version_dependent_macros() \ No newline at end of file diff --git a/op_builder/fused_index_mul_2d.py b/op_builder/fused_index_mul_2d.py new file mode 100644 index 000000000..d04564e15 --- /dev/null +++ b/op_builder/fused_index_mul_2d.py @@ -0,0 +1,34 @@ +from .builder import CUDAOpBuilder + +import sys + + +class FusedIndexMul2dBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_FUSED_INDEX_MUL_2D' + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" + NAME = "fused_index_mul_2d" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ['contrib/csrc/index_mul_2d/index_mul_2d_cuda.cpp', + 'contrib/csrc/index_mul_2d/index_mul_2d_cuda_kernel.cu'] + + def include_paths(self): + return ['contrib/csrc/'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + nvcc_flags = ['-O3'] + self.version_dependent_macros() + if not self.is_rocm_pytorch(): + nvcc_flags += ['--use_fast_math', '--ftz=false'] + else: + nvcc_flags += self.aten_atomic_args() + return nvcc_flags \ No newline at end of file diff --git a/op_builder/fused_lamb.py b/op_builder/fused_lamb.py new file mode 100644 index 000000000..02a0b6fe7 --- /dev/null +++ b/op_builder/fused_lamb.py @@ -0,0 +1,34 @@ +from .builder import CUDAOpBuilder + +import sys + + +class FusedLambBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_FUSED_LAMB' + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" + NAME = "fused_lamb_cuda" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ['contrib/csrc/optimizers/fused_lamb_cuda.cpp', + 'contrib/csrc/optimizers/fused_lamb_cuda_kernel.cu', + 'csrc/multi_tensor_l2norm_kernel.cu'] + + def include_paths(self): + return ['contrib/csrc/', + 'csrc'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + nvcc_flags = ['-O3'] + self.version_dependent_macros() + if not self.is_rocm_pytorch(): + nvcc_flags += ['--use_fast_math'] + return nvcc_flags \ No newline at end of file diff --git a/op_builder/fused_layer_norm.py b/op_builder/fused_layer_norm.py new file mode 100644 index 
000000000..66130f17b --- /dev/null +++ b/op_builder/fused_layer_norm.py @@ -0,0 +1,31 @@ +from .builder import CUDAOpBuilder + +import sys + + +class FusedLayerNormBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_FUSED_LAYER_NORM' + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" + NAME = "fused_layer_norm_cuda" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ['csrc/layer_norm_cuda.cpp', 'csrc/layer_norm_cuda_kernel.cu'] + + def include_paths(self): + return ['csrc'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + nvcc_flags = ['-O3'] + self.version_dependent_macros() + if not self.is_rocm_pytorch(): + nvcc_flags.extend(['--use_fast_math', '-maxrregcount=50']) + return nvcc_flags \ No newline at end of file diff --git a/op_builder/fused_rope.py b/op_builder/fused_rope.py new file mode 100644 index 000000000..c87f14b84 --- /dev/null +++ b/op_builder/fused_rope.py @@ -0,0 +1,40 @@ +from .builder import CUDAOpBuilder + +import sys + + +class FusedRopeBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_FUSED_ROPE' + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" + NAME = "fused_rotary_positional_embedding" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ["csrc/megatron/fused_rotary_positional_embedding.cpp", + "csrc/megatron/fused_rotary_positional_embedding_cuda.cu"] + + def include_paths(self): + return ['csrc', 'csrc/megatron'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + nvcc_flags = [ + '-O3', + '-U__CUDA_NO_HALF_OPERATORS__', + '-U__CUDA_NO_HALF_CONVERSIONS__' + ] + self.version_dependent_macros() + if not self.is_rocm_pytorch(): + nvcc_flags.extend( + [ + '--expt-relaxed-constexpr', + '--expt-extended-lambda' + ]) + return nvcc_flags \ No newline at end of file diff --git a/op_builder/fused_weight_gradient_mlp.py b/op_builder/fused_weight_gradient_mlp.py new file mode 100644 index 000000000..b6d595385 --- /dev/null +++ b/op_builder/fused_weight_gradient_mlp.py @@ -0,0 +1,42 @@ +from .builder import CUDAOpBuilder + +class FusedWeightGradientMlpCudaBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_FUSED_WEIGHT_GRADIENT_MLP' + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" + NAME = "fused_weight_gradient_mlp_cuda" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return [ + "csrc/megatron/fused_weight_gradient_dense.cpp", + "csrc/megatron/fused_weight_gradient_dense_cuda.cu", + "csrc/megatron/fused_weight_gradient_dense_16bit_prec_cuda.cu", + ] + + def include_paths(self): + # Both csrc and csrc/megatron are included in the original extension + return ['csrc', 'csrc/megatron'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + nvcc_flags = [ + '-O3', + '-U__CUDA_NO_HALF_OPERATORS__', + '-U__CUDA_NO_HALF_CONVERSIONS__' + ] + self.version_dependent_macros() + if not self.is_rocm_pytorch(): + nvcc_flags.extend( + [ + '--expt-relaxed-constexpr', + '--expt-extended-lambda', + "--use_fast_math" + ]) + self.compute_capability_args() + return nvcc_flags \ No newline at end of file diff --git a/op_builder/generic_scaled_masked_softmax_cuda.py b/op_builder/generic_scaled_masked_softmax_cuda.py new file mode 
100644 index 000000000..a0fb2d5fc --- /dev/null +++ b/op_builder/generic_scaled_masked_softmax_cuda.py @@ -0,0 +1,39 @@ +from .builder import CUDAOpBuilder + +class GenericScaledMaskedSoftmaxCudaBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_GENERIC_SCALED_MASKED_SOFTMAX_CUDA' + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" + NAME = "generic_scaled_masked_softmax_cuda" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return [ + "csrc/megatron/generic_scaled_masked_softmax_cpu.cpp", + "csrc/megatron/generic_scaled_masked_softmax_cuda.cu" + ] + + def include_paths(self): + return ['csrc'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + nvcc_flags = [ + '-O3', + '-U__CUDA_NO_HALF_OPERATORS__', + '-U__CUDA_NO_HALF_CONVERSIONS__' + ] + self.version_dependent_macros() + if not self.is_rocm_pytorch(): + nvcc_flags.extend( + [ + '--expt-relaxed-constexpr', + '--expt-extended-lambda' + ]) + return nvcc_flags \ No newline at end of file diff --git a/op_builder/mlp.py b/op_builder/mlp.py new file mode 100644 index 000000000..c6a177721 --- /dev/null +++ b/op_builder/mlp.py @@ -0,0 +1,32 @@ +from .builder import CUDAOpBuilder + +import sys + + +class MlpBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_MLP' + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" + NAME = "mlp_cuda" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ['csrc/mlp.cpp', + 'csrc/mlp_cuda.cu'] + + def include_paths(self): + return ['csrc'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + nvcc_flags = ['-O3'] + self.version_dependent_macros() + if self.is_rocm_pytorch(): + nvcc_flags.extend(self.backward_pass_guard_args()) + return nvcc_flags \ No newline at end of file diff --git a/op_builder/nccl_allocator.py b/op_builder/nccl_allocator.py new file mode 100644 index 000000000..320e76476 --- /dev/null +++ b/op_builder/nccl_allocator.py @@ -0,0 +1,36 @@ +from .builder import CUDAOpBuilder + +import sys + + +class NCCLAllocatorBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_NCCL_ALLOCATOR' + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" + NAME = "_apex_nccl_allocator" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ["contrib/csrc/nccl_allocator/NCCLAllocator.cpp"] + + def include_paths(self): + return ['contrib/csrc/'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + self.generator_args() + + def nvcc_args(self): + return self.nccl_args() + + def is_compatible(self, verbose=False): + torch_version = self.torch_version() + if torch_version >= (2, 6): + available_nccl_version = self.nccl_version() + if available_nccl_version >= (2, 19): + return True + return False \ No newline at end of file diff --git a/op_builder/nccl_p2p.py b/op_builder/nccl_p2p.py new file mode 100644 index 000000000..37772572e --- /dev/null +++ b/op_builder/nccl_p2p.py @@ -0,0 +1,26 @@ +from .builder import CUDAOpBuilder + +import sys + + +class NCCLP2PBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_NCCL_P2P' + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" + NAME = "nccl_p2p_cuda" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + 
def sources(self): + return ["contrib/csrc/nccl_p2p/nccl_p2p_cuda.cu", + "contrib/csrc/nccl_p2p/nccl_p2p.cpp"] + + def include_paths(self): + return ['contrib/csrc/'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + self.generator_args() \ No newline at end of file diff --git a/op_builder/peer_memory.py b/op_builder/peer_memory.py new file mode 100644 index 000000000..c869f0be6 --- /dev/null +++ b/op_builder/peer_memory.py @@ -0,0 +1,26 @@ +from .builder import CUDAOpBuilder + +import sys + + +class PeerMemoryBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_PEER_MEMORY' + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" + NAME = "peer_memory_cuda" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ["contrib/csrc/peer_memory/peer_memory_cuda.cu", + "contrib/csrc/peer_memory/peer_memory.cpp"] + + def include_paths(self): + return ['contrib/csrc/'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + self.generator_args() \ No newline at end of file diff --git a/op_builder/scaled_masked_softmax_cuda.py b/op_builder/scaled_masked_softmax_cuda.py new file mode 100644 index 000000000..1013ef8d2 --- /dev/null +++ b/op_builder/scaled_masked_softmax_cuda.py @@ -0,0 +1,40 @@ +from .builder import CUDAOpBuilder + +class ScaledMaskedSoftmaxCudaBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_SCALED_MASKED_SOFTMAX_CUDA' + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" + NAME = "scaled_masked_softmax_cuda" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return [ + "csrc/megatron/scaled_masked_softmax_cpu.cpp", + "csrc/megatron/scaled_masked_softmax_cuda.cu" + ] + + def include_paths(self): + # Both csrc and csrc/megatron are included in the original extension + return ['csrc', 'csrc/megatron'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + nvcc_flags = [ + '-O3', + '-U__CUDA_NO_HALF_OPERATORS__', + '-U__CUDA_NO_HALF_CONVERSIONS__' + ] + self.version_dependent_macros() + if not self.is_rocm_pytorch(): + nvcc_flags.extend( + [ + '--expt-relaxed-constexpr', + '--expt-extended-lambda' + ]) + return nvcc_flags \ No newline at end of file diff --git a/op_builder/scaled_softmax_cuda.py b/op_builder/scaled_softmax_cuda.py new file mode 100644 index 000000000..f29543963 --- /dev/null +++ b/op_builder/scaled_softmax_cuda.py @@ -0,0 +1,41 @@ +from .builder import CUDAOpBuilder + +import sys + +class ScaledSoftmaxCudaBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_SCALED_SOFTMAX_CUDA' + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" + NAME = "scaled_softmax_cuda" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return [ + "csrc/megatron/scaled_softmax_cpu.cpp", + "csrc/megatron/scaled_softmax_cuda.cu" + ] + + def include_paths(self): + return ['csrc'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + nvcc_flags = [ + '-O3', + '-U__CUDA_NO_HALF_OPERATORS__', + '-U__CUDA_NO_HALF_CONVERSIONS__' + ] + self.version_dependent_macros() + if not self.is_rocm_pytorch(): + nvcc_flags.extend( + [ + '--expt-relaxed-constexpr', + '--expt-extended-lambda' + ]) + return nvcc_flags \ No newline at end of file diff --git 
a/op_builder/scaled_upper_triang_masked_softmax_cuda.py b/op_builder/scaled_upper_triang_masked_softmax_cuda.py new file mode 100644 index 000000000..3c2273ad9 --- /dev/null +++ b/op_builder/scaled_upper_triang_masked_softmax_cuda.py @@ -0,0 +1,39 @@ +from .builder import CUDAOpBuilder + +class ScaledUpperTriangMaskedSoftmaxCudaBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_SCALED_UPPER_TRIANG_MASKED_SOFTMAX_CUDA' + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" + NAME = "scaled_upper_triang_masked_softmax_cuda" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return [ + "csrc/megatron/scaled_upper_triang_masked_softmax_cpu.cpp", + "csrc/megatron/scaled_upper_triang_masked_softmax_cuda.cu" + ] + + def include_paths(self): + return ['csrc'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + nvcc_flags = [ + '-O3', + '-U__CUDA_NO_HALF_OPERATORS__', + '-U__CUDA_NO_HALF_CONVERSIONS__' + ] + self.version_dependent_macros() + if not self.is_rocm_pytorch(): + nvcc_flags.extend( + [ + '--expt-relaxed-constexpr', + '--expt-extended-lambda' + ]) + return nvcc_flags \ No newline at end of file diff --git a/op_builder/syncbn.py b/op_builder/syncbn.py new file mode 100644 index 000000000..251c33e01 --- /dev/null +++ b/op_builder/syncbn.py @@ -0,0 +1,28 @@ +from .builder import CUDAOpBuilder + +import sys + + +class SyncBnBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_SYNCBN' + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" + NAME = "syncbn" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ['csrc/syncbn.cpp', 'csrc/welford.cu'] + + def include_paths(self): + return ['csrc'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + return ['-O3'] + self.version_dependent_macros() \ No newline at end of file diff --git a/op_builder/transducer_joint.py b/op_builder/transducer_joint.py new file mode 100644 index 000000000..c17f60f7b --- /dev/null +++ b/op_builder/transducer_joint.py @@ -0,0 +1,33 @@ +from .builder import CUDAOpBuilder +import sys + + +class TransducerJointBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_TRANSDUCER_JOINT' + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" + NAME = "transducer_joint_cuda" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ["contrib/csrc/transducer/transducer_joint.cpp", + "contrib/csrc/transducer/transducer_joint_kernel.cu"] + + def include_paths(self): + return ['contrib/csrc/', + #it uses philox.cuh from contrib/csrc/multihead_attn + 'contrib/csrc/multihead_attn'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + self.generator_args() + + def nvcc_args(self): + nvcc_flags = ['-O3'] + self.version_dependent_macros() + self.generator_args() + if not self.is_rocm_pytorch(): + nvcc_flags += self.nvcc_threads_args() + return nvcc_flags \ No newline at end of file diff --git a/op_builder/transducer_loss.py b/op_builder/transducer_loss.py new file mode 100644 index 000000000..53ae4eaac --- /dev/null +++ b/op_builder/transducer_loss.py @@ -0,0 +1,31 @@ +from .builder import CUDAOpBuilder +import sys + + +class TransducerLossBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_TRANSDUCER_LOSS' + INCLUDE_FLAG = 
"APEX_BUILD_CUDA_OPS" + NAME = "transducer_loss_cuda" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ["contrib/csrc/transducer/transducer_loss.cpp", + "contrib/csrc/transducer/transducer_loss_kernel.cu"] + + def include_paths(self): + return ['contrib/csrc/' ] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + nvcc_flags = ['-O3'] + self.version_dependent_macros() + if not self.is_rocm_pytorch(): + nvcc_flags += self.nvcc_threads_args() + return nvcc_flags \ No newline at end of file diff --git a/op_builder/xentropy.py b/op_builder/xentropy.py new file mode 100644 index 000000000..84f3ddf12 --- /dev/null +++ b/op_builder/xentropy.py @@ -0,0 +1,29 @@ +from .builder import CUDAOpBuilder + +import sys + + +class XentropyBuilder(CUDAOpBuilder): + BUILD_VAR = 'APEX_BUILD_XENTROPY' + INCLUDE_FLAG = "APEX_BUILD_CUDA_OPS" + NAME = "xentropy_cuda" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'apex.{self.NAME}' + + def sources(self): + return ['contrib/csrc/xentropy/interface.cpp', + 'contrib/csrc/xentropy/xentropy_kernel.cu'] + + def include_paths(self): + return ['csrc', 'contrib/csrc/' ] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + return ['-O3'] + self.version_dependent_macros() \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 241f90a94..d527b4498 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,4 +5,8 @@ PyYAML>=5.1 pytest>=3.5.1 packaging>=14.0 matplotlib>=3.8 -pandas>=2.2.2 \ No newline at end of file +pandas>=2.2.2 +py-cpuinfo +build +ninja +wheel \ No newline at end of file diff --git a/scripts/clean.py b/scripts/clean.py new file mode 100644 index 000000000..be7e69798 --- /dev/null +++ b/scripts/clean.py @@ -0,0 +1,16 @@ +import torch.utils.cpp_extension +import shutil +import os +import sys + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) +from op_builder.all_ops import ALL_OPS + +torch_ext_directory = torch.utils.cpp_extension._get_build_directory("", False) + +install_ops = dict.fromkeys(ALL_OPS.keys(), False) +for op_name, builder in ALL_OPS.items(): + path = os.path.join(torch_ext_directory, op_name) + if os.path.exists(path): + print ("removing torch extension", op_name, "at", torch_ext_directory) + shutil.rmtree(path) \ No newline at end of file diff --git a/setup.py b/setup.py index c4044a0a3..816849e7c 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import glob from packaging.version import parse, Version -from setuptools import setup, find_packages +from setuptools import setup, find_packages, Distribution import subprocess import torch @@ -17,44 +17,17 @@ load, ) +import typing +import shlex + +sys.path.insert(0, os.path.abspath(os.path.dirname(__file__))) + +from op_builder.all_ops import ALL_OPS +import shutil # ninja build does not work unless include_dirs are abs path this_dir = os.path.dirname(os.path.abspath(__file__)) -torch_dir = torch.__path__[0] - - -# https://github.com/pytorch/pytorch/pull/71881 -# For the extensions which have rocblas_gemm_flags_fp16_alt_impl we need to make sure if at::BackwardPassGuard exists. -# It helps the extensions be backward compatible with old PyTorch versions. 
-# The check and ROCM_BACKWARD_PASS_GUARD in nvcc/hipcc args can be retired once the PR is merged into PyTorch upstream. - -context_file = os.path.join(torch_dir, "include", "ATen", "Context.h") -if os.path.exists(context_file): - lines = open(context_file, 'r').readlines() - found_Backward_Pass_Guard = False - found_ROCmBackward_Pass_Guard = False - for line in lines: - if "BackwardPassGuard" in line: - # BackwardPassGuard has been renamed to ROCmBackwardPassGuard - # https://github.com/pytorch/pytorch/pull/71881/commits/4b82f5a67a35406ffb5691c69e6b4c9086316a43 - if "ROCmBackwardPassGuard" in line: - found_ROCmBackward_Pass_Guard = True - else: - found_Backward_Pass_Guard = True - break - -found_aten_atomic_header = False -if os.path.exists(os.path.join(torch_dir, "include", "ATen", "Atomic.cuh")): - found_aten_atomic_header = True - -def raise_if_cuda_home_none(global_option: str) -> None: - if CUDA_HOME is not None or ROCM_HOME is not None: - return - raise RuntimeError( - f"{global_option} was requested, but nvcc was not found. Are you sure your environment has nvcc available? " - "If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, " - "only images whose names contain 'devel' will provide nvcc." - ) + def get_cuda_bare_metal_version(cuda_dir): raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True) @@ -74,50 +47,6 @@ def get_rocm_bare_metal_version(rocm_dir): bare_metal_minor = release[1][0] return raw_output, bare_metal_major, bare_metal_minor -def check_cuda_torch_binary_vs_bare_metal(cuda_dir): - raw_output, bare_metal_major, bare_metal_minor = get_cuda_bare_metal_version(cuda_dir) - torch_binary_major = torch.version.cuda.split(".")[0] - torch_binary_minor = torch.version.cuda.split(".")[1] - - print("\nCompiling cuda extensions with") - print(raw_output + "from " + cuda_dir + "/bin\n") - - if (bare_metal_major != torch_binary_major) or (bare_metal_minor != torch_binary_minor): - raise RuntimeError( - "Cuda extensions are being compiled with a version of Cuda that does " - "not match the version used to compile Pytorch binaries. " - "Pytorch binaries were compiled with Cuda {}.\n".format(torch.version.cuda) - + "In some cases, a minor-version mismatch will not cause later errors: " - "https://github.com/NVIDIA/apex/pull/323#discussion_r287021798. " - "You can try commenting out this check (at your own risk)." - ) - -def check_rocm_torch_binary_vs_bare_metal(rocm_dir): - raw_output, bare_metal_major, bare_metal_minor = get_rocm_bare_metal_version(rocm_dir) - torch_binary_major = torch.version.hip.split(".")[0] - torch_binary_minor = torch.version.hip.split(".")[1] - - print("\nCompiling rocm extensions with") - print(raw_output + "from " + rocm_dir + "/bin\n") - - if (bare_metal_major != torch_binary_major) or (bare_metal_minor != torch_binary_minor): - raise RuntimeError( - "Cuda extensions are being compiled with a version of Cuda that does " - "not match the version used to compile Pytorch binaries. " - "Pytorch binaries were compiled with Cuda {}.\n".format(torch.version.cuda) - + "In some cases, a minor-version mismatch will not cause later errors: " - "https://github.com/NVIDIA/apex/pull/323#discussion_r287021798. " - "You can try commenting out this check (at your own risk)." - ) - -def raise_if_home_none(global_option: str) -> None: - if CUDA_HOME is not None or ROCM_HOME is not None: - return - raise RuntimeError( - f"{global_option} was requested, but nvcc was not found. 
Are you sure your environment has nvcc available? " - "If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, " - "only images whose names contain 'devel' will provide nvcc." - ) def get_apex_version(): cwd = os.path.dirname(os.path.abspath(__file__)) @@ -135,23 +64,6 @@ def get_apex_version(): apex_version += ".git"+os.getenv("APEX_COMMIT")[:8] return apex_version -def append_nvcc_threads(nvcc_extra_args): - _, bare_metal_major, bare_metal_minor = get_cuda_bare_metal_version(CUDA_HOME) - if int(bare_metal_major) >= 11 and int(bare_metal_minor) >= 2: - return nvcc_extra_args + ["--threads", "4"] - return nvcc_extra_args - - -def check_cudnn_version_and_warn(global_option: str, required_cudnn_version: int) -> bool: - cudnn_available = torch.backends.cudnn.is_available() - cudnn_version = torch.backends.cudnn.version() if cudnn_available else None - if not (cudnn_available and (cudnn_version >= required_cudnn_version)): - warnings.warn( - f"Skip `{global_option}` as it requires cuDNN {required_cudnn_version} or later, " - f"but {'cuDNN is not available' if not cudnn_available else cudnn_version}" - ) - return False - return True print("\n\ntorch.__version__ = {}\n\n".format(torch.__version__)) TORCH_MAJOR = int(torch.__version__.split('.')[0]) @@ -202,779 +114,154 @@ def check_if_rocm_pytorch(): ) # cmdclass = {} -ext_modules = [] - extras = {} -# Set up macros for forward/backward compatibility hack around -# https://github.com/pytorch/pytorch/commit/4404762d7dd955383acee92e6f06b48144a0742e -# and -# https://github.com/NVIDIA/apex/issues/456 -# https://github.com/pytorch/pytorch/commit/eb7b39e02f7d75c26d8a795ea8c7fd911334da7e#diff-4632522f237f1e4e728cb824300403ac -version_ge_1_1 = [] -if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 0): - version_ge_1_1 = ["-DVERSION_GE_1_1"] -version_ge_1_3 = [] -if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 2): - version_ge_1_3 = ["-DVERSION_GE_1_3"] -version_ge_1_5 = [] -if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 4): - version_ge_1_5 = ["-DVERSION_GE_1_5"] -version_dependent_macros = version_ge_1_1 + version_ge_1_3 + version_ge_1_5 - if not IS_ROCM_PYTORCH: _, bare_metal_version = get_cuda_bare_metal_version(CUDA_HOME) else: _, bare_metal_version, bare_metal_minor = get_rocm_bare_metal_version(ROCM_HOME) -if IS_ROCM_PYTORCH and (ROCM_MAJOR >= 6): - version_dependent_macros += ["-DHIPBLAS_V2"] +# ***************************** Op builder ********************** + +def get_env_if_set(key, default: typing.Any = ""): + """ + Returns an environment variable if it is set and not "", + otherwise returns a default value. In contrast, the fallback + parameter of os.environ.get() is skipped if the variable is set to "". 
+ """ + return os.environ.get(key, None) or default -if "--cpp_ext" in sys.argv or "--cuda_ext" in sys.argv: +def command_exists(cmd): + if sys.platform == "win32": + safe_cmd = shlex.split(f'{cmd}') + result = subprocess.Popen(safe_cmd, stdout=subprocess.PIPE) + return result.wait() == 1 + else: + safe_cmd = shlex.split(f"bash -c type {cmd}") + result = subprocess.Popen(safe_cmd, stdout=subprocess.PIPE) + return result.wait() == 0 + +BUILD_OP_DEFAULT = 0 +BUILD_CPP_OPS = int(get_env_if_set('APEX_BUILD_CPP_OPS', BUILD_OP_DEFAULT)) +BUILD_CUDA_OPS = int(get_env_if_set('APEX_BUILD_CUDA_OPS', BUILD_OP_DEFAULT)) +build_flags = { + "APEX_BUILD_CPP_OPS" : BUILD_CPP_OPS, + "APEX_BUILD_CUDA_OPS" : BUILD_CUDA_OPS, + } + +if BUILD_CPP_OPS or BUILD_CUDA_OPS: if TORCH_MAJOR == 0: raise RuntimeError("--cpp_ext requires Pytorch 1.0 or later, " "found torch.__version__ = {}".format(torch.__version__) ) -if "--cpp_ext" in sys.argv: - sys.argv.remove("--cpp_ext") - ext_modules.append(CppExtension("apex_C", ["csrc/flatten_unflatten.cpp"])) - -if "--distributed_adam" in sys.argv or "--cuda_ext" in sys.argv: - if "--distributed_adam" in sys.argv: - sys.argv.remove("--distributed_adam") - - raise_if_home_none("--distributed_adam") - nvcc_args_adam = ['-O3', '--use_fast_math'] + version_dependent_macros - hipcc_args_adam = ['-O3'] + version_dependent_macros - ext_modules.append( - CUDAExtension( - name='distributed_adam_cuda', - sources=[ - 'apex/contrib/csrc/optimizers/multi_tensor_distopt_adam.cpp', - 'apex/contrib/csrc/optimizers/multi_tensor_distopt_adam_kernel.cu', - ], - include_dirs=[ - os.path.join(this_dir, 'csrc'), - os.path.join(this_dir, 'apex/contrib/csrc/optimizers'), - ], - extra_compile_args={ - 'cxx': ['-O3',] + version_dependent_macros, - 'nvcc':nvcc_args_adam if not IS_ROCM_PYTORCH else hipcc_args_adam, - } - ) - ) - -if "--distributed_lamb" in sys.argv or "--cuda_ext" in sys.argv: - if "--distributed_lamb" in sys.argv: - sys.argv.remove("--distributed_lamb") - - raise_if_home_none("--distributed_lamb") - - print ("INFO: Building the distributed_lamb extension.") - nvcc_args_distributed_lamb = ['-O3', '--use_fast_math'] + version_dependent_macros - hipcc_args_distributed_lamb = ['-O3'] + version_dependent_macros - ext_modules.append( - CUDAExtension( - name='distributed_lamb_cuda', - sources=[ - 'apex/contrib/csrc/optimizers/multi_tensor_distopt_lamb.cpp', - 'apex/contrib/csrc/optimizers/multi_tensor_distopt_lamb_kernel.cu', - ], - include_dirs=[os.path.join(this_dir, 'csrc')], - extra_compile_args={ - 'cxx': ['-O3',] + version_dependent_macros, - 'nvcc': nvcc_args_distributed_lamb if not IS_ROCM_PYTORCH else hipcc_args_distributed_lamb, - } - ) - ) - - -if "--cuda_ext" in sys.argv: - raise_if_home_none("--cuda_ext") - - if not IS_ROCM_PYTORCH: - check_cuda_torch_binary_vs_bare_metal(CUDA_HOME) - else: - check_rocm_torch_binary_vs_bare_metal(ROCM_HOME) - -#********** multi-tensor apply **************** - print ("INFO: Building the multi-tensor apply extension.") - nvcc_args_multi_tensor = ['-lineinfo', '-O3', '--use_fast_math'] + version_dependent_macros - hipcc_args_multi_tensor = ['-O3'] + version_dependent_macros - ext_modules.append( - CUDAExtension( - name='amp_C', - sources=[ - 'csrc/amp_C_frontend.cpp', - 'csrc/multi_tensor_sgd_kernel.cu', - 'csrc/multi_tensor_scale_kernel.cu', - 'csrc/multi_tensor_axpby_kernel.cu', - 'csrc/multi_tensor_l2norm_kernel.cu', - 'csrc/multi_tensor_l2norm_kernel_mp.cu', - 'csrc/multi_tensor_l2norm_scale_kernel.cu', - 'csrc/multi_tensor_lamb_stage_1.cu', - 
'csrc/multi_tensor_lamb_stage_2.cu', - 'csrc/multi_tensor_adam.cu', - 'csrc/multi_tensor_adagrad.cu', - 'csrc/multi_tensor_novograd.cu', - 'csrc/multi_tensor_lars.cu', - 'csrc/multi_tensor_lamb.cu', - 'csrc/multi_tensor_lamb_mp.cu'], - include_dirs=[os.path.join(this_dir, 'csrc')], - extra_compile_args={'cxx': ['-O3'] + version_dependent_macros, - 'nvcc': nvcc_args_multi_tensor if not IS_ROCM_PYTORCH else hipcc_args_multi_tensor, - } - ) - ) - -#********** syncbn **************** - print("INFO: Building syncbn extension.") - ext_modules.append( - CUDAExtension( - name='syncbn', - sources=[ - 'csrc/syncbn.cpp', - 'csrc/welford.cu', - ], - include_dirs=[os.path.join(this_dir, 'csrc')], - extra_compile_args={ - 'cxx': ['-O3'] + version_dependent_macros, - 'nvcc':['-O3'] + version_dependent_macros, - } - ) - ) - -#********** fused layernorm **************** - nvcc_args_layer_norm = ['-maxrregcount=50', '-O3', '--use_fast_math'] + version_dependent_macros - hipcc_args_layer_norm = ['-O3'] + version_dependent_macros - - print ("INFO: Building fused layernorm extension.") - ext_modules.append( - CUDAExtension( - name='fused_layer_norm_cuda', - sources=[ - 'csrc/layer_norm_cuda.cpp', - 'csrc/layer_norm_cuda_kernel.cu', - ], - include_dirs=[os.path.join(this_dir, 'csrc')], - extra_compile_args={ - 'cxx': ['-O3'] + version_dependent_macros, - 'nvcc': nvcc_args_layer_norm if not IS_ROCM_PYTORCH else hipcc_args_layer_norm, - } - ) - ) - -#********** fused dense **************** - ext_modules.append( - CUDAExtension( - name='fused_dense_cuda', - sources=[ - 'csrc/fused_dense_base.cpp', - 'csrc/fused_dense_cuda.cu', - ], - extra_compile_args={ - 'cxx': ['-O3'] + version_dependent_macros, - 'nvcc':['-O3'] + version_dependent_macros - } - ) - ) - - bare_metal_version = Version(bare_metal_version) - print("Bare Metal Version : ", bare_metal_version) - if True: - - cc_flag = [] - cc_flag.append("-gencode") - cc_flag.append("arch=compute_70,code=sm_70") - cc_flag.append("-gencode") - cc_flag.append("arch=compute_80,code=sm_80") - if bare_metal_version >= Version("11.1"): - cc_flag.append("-gencode") - cc_flag.append("arch=compute_86,code=sm_86") - if bare_metal_version >= Version("11.8"): - cc_flag.append("-gencode") - cc_flag.append("arch=compute_90,code=sm_90") - - nvcc_args_fused_weight_gradient = [ - "-O3", - "-U__CUDA_NO_HALF_OPERATORS__", - "-U__CUDA_NO_HALF_CONVERSIONS__", - "--expt-relaxed-constexpr", - "--expt-extended-lambda", - "--use_fast_math", - ] + version_dependent_macros + cc_flag - - hipcc_args_fused_weight_gradient = [ - "-O3", - "-U__CUDA_NO_HALF_OPERATORS__", - "-U__CUDA_NO_HALF_CONVERSIONS__" - ] + version_dependent_macros - - ext_modules.append( - CUDAExtension( - name="fused_weight_gradient_mlp_cuda", - include_dirs=[os.path.join(this_dir, "csrc")], - sources=[ - "csrc/megatron/fused_weight_gradient_dense.cpp", - "csrc/megatron/fused_weight_gradient_dense_cuda.cu", - "csrc/megatron/fused_weight_gradient_dense_16bit_prec_cuda.cu", - ], - extra_compile_args={ - "cxx": ["-O3"] + version_dependent_macros, - "nvcc": nvcc_args_fused_weight_gradient if not IS_ROCM_PYTORCH else hipcc_args_fused_weight_gradient, - }, - ) - ) -#********** mlp_cuda **************** - hipcc_args_mlp = ['-O3'] + version_dependent_macros - if found_Backward_Pass_Guard: - hipcc_args_mlp = hipcc_args_mlp + ['-DBACKWARD_PASS_GUARD'] + ['-DBACKWARD_PASS_GUARD_CLASS=BackwardPassGuard'] - if found_ROCmBackward_Pass_Guard: - hipcc_args_mlp = hipcc_args_mlp + ['-DBACKWARD_PASS_GUARD'] + 
['-DBACKWARD_PASS_GUARD_CLASS=ROCmBackwardPassGuard'] - - print ("INFO: Building the MLP Extension.") - ext_modules.append( - CUDAExtension( - name='mlp_cuda', - sources=[ - 'csrc/mlp.cpp', - 'csrc/mlp_cuda.cu', - ], - include_dirs=[os.path.join(this_dir, 'csrc')], - extra_compile_args={ - 'cxx': ['-O3'] + version_dependent_macros, - 'nvcc':['-O3'] + version_dependent_macros if not IS_ROCM_PYTORCH else hipcc_args_mlp, - } - ) - ) - -#********** scaled_upper_triang_masked_softmax_cuda **************** - nvcc_args_transformer = ['-O3', - '-U__CUDA_NO_HALF_OPERATORS__', - '-U__CUDA_NO_HALF_CONVERSIONS__', - '--expt-relaxed-constexpr', - '--expt-extended-lambda'] + version_dependent_macros - hipcc_args_transformer = ['-O3', - '-U__CUDA_NO_HALF_OPERATORS__', - '-U__CUDA_NO_HALF_CONVERSIONS__'] + version_dependent_macros - - ext_modules.append( - CUDAExtension( - name='scaled_upper_triang_masked_softmax_cuda', - sources=[ - 'csrc/megatron/scaled_upper_triang_masked_softmax_cpu.cpp', - 'csrc/megatron/scaled_upper_triang_masked_softmax_cuda.cu', - ], - include_dirs=[os.path.join(this_dir, 'csrc')], - extra_compile_args={ - 'cxx': ['-O3'] + version_dependent_macros, - 'nvcc':nvcc_args_transformer if not IS_ROCM_PYTORCH else hipcc_args_transformer, - } - ) - ) -#*********** generic_scaled_masked_softmax_cuda **************** - ext_modules.append( - CUDAExtension( - name="generic_scaled_masked_softmax_cuda", - sources=[ - "csrc/megatron/generic_scaled_masked_softmax_cpu.cpp", - "csrc/megatron/generic_scaled_masked_softmax_cuda.cu", - ], - include_dirs=[os.path.join(this_dir, "csrc")], - extra_compile_args={ - "cxx": ["-O3"] + version_dependent_macros, - "nvcc": nvcc_args_transformer if not IS_ROCM_PYTORCH else hipcc_args_transformer, - }, - ) - ) - - -#*********** scaled_masked_softmax_cuda **************** - ext_modules.append( - CUDAExtension( - name='scaled_masked_softmax_cuda', - sources=[ - 'csrc/megatron/scaled_masked_softmax_cpu.cpp', - 'csrc/megatron/scaled_masked_softmax_cuda.cu', - ], - include_dirs=[os.path.join(this_dir, 'csrc'), - os.path.join(this_dir, 'csrc/megatron')], - extra_compile_args={ - 'cxx': ['-O3'] + version_dependent_macros, - 'nvcc':nvcc_args_transformer if not IS_ROCM_PYTORCH else hipcc_args_transformer, - } - ) - ) - -#*********** scaled_softmax_cuda **************** - ext_modules.append( - CUDAExtension( - name="scaled_softmax_cuda", - sources=[ - "csrc/megatron/scaled_softmax_cpu.cpp", - "csrc/megatron/scaled_softmax_cuda.cu", - ], - include_dirs=[os.path.join(this_dir, "csrc")], - extra_compile_args={ - "cxx": ["-O3"] + version_dependent_macros, - "nvcc":nvcc_args_transformer if not IS_ROCM_PYTORCH else hipcc_args_transformer, - } - ) - ) - -#*********** fused_rotary_positional_embedding **************** - if IS_ROCM_PYTORCH and "--aiter" in sys.argv: - sys.argv.remove("--aiter") - subprocess.run(["pip", "install", "."], cwd = "third_party/aiter") - - ext_modules.append( - CUDAExtension( - name="fused_rotary_positional_embedding", - sources=[ - "csrc/megatron/fused_rotary_positional_embedding.cpp", - "csrc/megatron/fused_rotary_positional_embedding_cuda.cu", - ], - include_dirs=[os.path.join(this_dir, "csrc")], - extra_compile_args={ - "cxx": ["-O3"] + version_dependent_macros, - "nvcc":nvcc_args_transformer if not IS_ROCM_PYTORCH else hipcc_args_transformer, - } - ) - ) - -#*********** fused_bias_swiglu **************** - nvcc_args_swiglu = ['-O3', - '-U__CUDA_NO_HALF_OPERATORS__', - '-U__CUDA_NO_HALF_CONVERSIONS__', - '--expt-relaxed-constexpr', - 
'--expt-extended-lambda'] + version_dependent_macros - hipcc_args_swiglu = ['-O3', - '-U__CUDA_NO_HALF_OPERATORS__', - '-U__CUDA_NO_HALF_CONVERSIONS__'] + version_dependent_macros - - if IS_ROCM_PYTORCH: - try: - amdgpu_targets = os.environ.get('PYTORCH_ROCM_ARCH', '') - if not amdgpu_targets: - print("Warning: PYTORCH_ROCM_ARCH environment variable is empty.") - print("Using default architecture. Set this variable for specific GPU targets.") - print("Example: export PYTORCH_ROCM_ARCH=gfx906") - amdgpu_targets = "gfx906" # Default to a common architecture - - # Handle multiple architectures (separated by semicolons) - for amdgpu_target in amdgpu_targets.split(';'): - if amdgpu_target: # Skip empty strings - hipcc_args_swiglu += [f'--offload-arch={amdgpu_target}'] - except Exception as e: - print(f"Warning: Error processing PYTORCH_ROCM_ARCH: {e}") - print("Falling back to default architecture gfx906") - hipcc_args_swiglu += ['--offload-arch=gfx906'] - - - ext_modules.append( - CUDAExtension( - name="fused_bias_swiglu", - sources=[ - "csrc/megatron/fused_bias_swiglu.cpp", - "csrc/megatron/fused_bias_swiglu_cuda.cu", - ], - include_dirs=[os.path.join(this_dir, "csrc")], - extra_compile_args={ - "cxx": ["-O3"] + version_dependent_macros, - "nvcc": nvcc_args_swiglu if not IS_ROCM_PYTORCH else hipcc_args_swiglu, - } - ) - ) - -if "--bnp" in sys.argv or "--cuda_ext" in sys.argv: - - if "--bnp" in sys.argv: - sys.argv.remove("--bnp") +def is_env_set(key): + """ + Checks if an environment variable is set and not "". + """ + return bool(os.environ.get(key, None)) - if torch.utils.cpp_extension.CUDA_HOME is None and not IS_ROCM_PYTORCH: - raise RuntimeError("--bnp was requested, but nvcc was not found. Are you sure your environment has nvcc available? If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.") - else: - ext_modules.append( - CUDAExtension(name='bnp', - sources=['apex/contrib/csrc/groupbn/batch_norm.cu', - 'apex/contrib/csrc/groupbn/ipc.cu', - 'apex/contrib/csrc/groupbn/interface.cpp', - 'apex/contrib/csrc/groupbn/batch_norm_add_relu.cu'], - include_dirs=[os.path.join(this_dir, 'csrc'), - os.path.join(this_dir, 'apex/contrib/csrc/groupbn')], - extra_compile_args={'cxx': [] + version_dependent_macros, - 'nvcc':['-DCUDA_HAS_FP16=1', - '-D__CUDA_NO_HALF_OPERATORS__', - '-D__CUDA_NO_HALF_CONVERSIONS__', - '-D__CUDA_NO_HALF2_OPERATORS__'] + version_dependent_macros})) - -if "--xentropy" in sys.argv or "--cuda_ext" in sys.argv: - if "--xentropy" in sys.argv: - sys.argv.remove("--xentropy") - - if torch.utils.cpp_extension.CUDA_HOME is None and not IS_ROCM_PYTORCH: - raise RuntimeError("--xentropy was requested, but nvcc was not found. Are you sure your environment has nvcc available? 
If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.") - else: - print ("INFO: Building the xentropy extension.") - ext_modules.append( - CUDAExtension(name='xentropy_cuda', - sources=['apex/contrib/csrc/xentropy/interface.cpp', - 'apex/contrib/csrc/xentropy/xentropy_kernel.cu'], - include_dirs=[os.path.join(this_dir, 'csrc'), - os.path.join(this_dir, 'apex/contrib/csrc/xentropy')], - extra_compile_args={'cxx': ['-O3'] + version_dependent_macros, - 'nvcc':['-O3'] + version_dependent_macros})) - -if "--focal_loss" in sys.argv or "--cuda_ext" in sys.argv: - if "--focal_loss" in sys.argv: - sys.argv.remove("--focal_loss") - ext_modules.append( - CUDAExtension( - name='focal_loss_cuda', - sources=[ - 'apex/contrib/csrc/focal_loss/focal_loss_cuda.cpp', - 'apex/contrib/csrc/focal_loss/focal_loss_cuda_kernel.cu', - ], - include_dirs=[os.path.join(this_dir, 'csrc')], - extra_compile_args={ - 'cxx': ['-O3'] + version_dependent_macros, - 'nvcc':(['-O3', '--use_fast_math', '--ftz=false'] if not IS_ROCM_PYTORCH else ['-O3']) + version_dependent_macros, - }, - ) - ) - -if "--index_mul_2d" in sys.argv or "--cuda_ext" in sys.argv: - if "--index_mul_2d" in sys.argv: - sys.argv.remove("--index_mul_2d") - - args_index_mul_2d = ['-O3'] - if not IS_ROCM_PYTORCH: - args_index_mul_2d += ['--use_fast_math', '--ftz=false'] - if found_aten_atomic_header: - args_index_mul_2d += ['-DATEN_ATOMIC_HEADER'] - - ext_modules.append( - CUDAExtension( - name='fused_index_mul_2d', - sources=[ - 'apex/contrib/csrc/index_mul_2d/index_mul_2d_cuda.cpp', - 'apex/contrib/csrc/index_mul_2d/index_mul_2d_cuda_kernel.cu', - ], - include_dirs=[os.path.join(this_dir, 'csrc')], - extra_compile_args={ - 'cxx': ['-O3'] + version_dependent_macros, - 'nvcc': args_index_mul_2d + version_dependent_macros, - }, - ) - ) - -if "--deprecated_fused_adam" in sys.argv or "--cuda_ext" in sys.argv: - if "--deprecated_fused_adam" in sys.argv: - sys.argv.remove("--deprecated_fused_adam") - - if torch.utils.cpp_extension.CUDA_HOME is None and not IS_ROCM_PYTORCH: - raise RuntimeError("--deprecated_fused_adam was requested, but nvcc was not found. Are you sure your environment has nvcc available? If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.") - else: - print ("INFO: Building deprecated fused adam extension.") - nvcc_args_fused_adam = ['-O3', '--use_fast_math'] + version_dependent_macros - hipcc_args_fused_adam = ['-O3'] + version_dependent_macros - ext_modules.append( - CUDAExtension(name='fused_adam_cuda', - sources=['apex/contrib/csrc/optimizers/fused_adam_cuda.cpp', - 'apex/contrib/csrc/optimizers/fused_adam_cuda_kernel.cu'], - include_dirs=[os.path.join(this_dir, 'csrc'), - os.path.join(this_dir, 'apex/contrib/csrc/optimizers')], - extra_compile_args={'cxx': ['-O3'] + version_dependent_macros, - 'nvcc' : nvcc_args_fused_adam if not IS_ROCM_PYTORCH else hipcc_args_fused_adam})) - -if "--deprecated_fused_lamb" in sys.argv or "--cuda_ext" in sys.argv: - if "--deprecated_fused_lamb" in sys.argv: - sys.argv.remove("--deprecated_fused_lamb") - - if torch.utils.cpp_extension.CUDA_HOME is None and not IS_ROCM_PYTORCH: - raise RuntimeError("--deprecated_fused_lamb was requested, but nvcc was not found. Are you sure your environment has nvcc available? 
If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.") - else: - print ("INFO: Building deprecated fused lamb extension.") - nvcc_args_fused_lamb = ['-O3', '--use_fast_math'] + version_dependent_macros - hipcc_args_fused_lamb = ['-O3'] + version_dependent_macros - ext_modules.append( - CUDAExtension(name='fused_lamb_cuda', - sources=['apex/contrib/csrc/optimizers/fused_lamb_cuda.cpp', - 'apex/contrib/csrc/optimizers/fused_lamb_cuda_kernel.cu', - 'csrc/multi_tensor_l2norm_kernel.cu'], - include_dirs=[os.path.join(this_dir, 'csrc')], - extra_compile_args = nvcc_args_fused_lamb if not IS_ROCM_PYTORCH else hipcc_args_fused_lamb)) - -# Check, if ATen/CUDAGeneratorImpl.h is found, otherwise use ATen/cuda/CUDAGeneratorImpl.h -# See https://github.com/pytorch/pytorch/pull/70650 -generator_flag = [] -torch_dir = torch.__path__[0] -if os.path.exists(os.path.join(torch_dir, "include", "ATen", "CUDAGeneratorImpl.h")): - generator_flag = ["-DOLD_GENERATOR_PATH"] - -if "--fast_layer_norm" in sys.argv: - sys.argv.remove("--fast_layer_norm") - raise_if_cuda_home_none("--fast_layer_norm") - # Check, if CUDA11 is installed for compute capability 8.0 - cc_flag = [] - _, bare_metal_major, _ = get_cuda_bare_metal_version(CUDA_HOME) - if int(bare_metal_major) >= 11: - cc_flag.append("-gencode") - cc_flag.append("arch=compute_80,code=sm_80") - - if CUDA_HOME is None and not IS_ROCM_PYTORCH: - raise RuntimeError("--fast_layer_norm was requested, but nvcc was not found. Are you sure your environment has nvcc available? If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.") - else: - # Check, if CUDA11 is installed for compute capability 8.0 - cc_flag = [] - _, bare_metal_major, _ = get_cuda_bare_metal_version(CUDA_HOME) - if int(bare_metal_major) >= 11: - cc_flag.append('-gencode') - cc_flag.append('arch=compute_80,code=sm_80') - -if "--fmha" in sys.argv: - sys.argv.remove("--fmha") - raise_if_cuda_home_none("--fmha") - # Check, if CUDA11 is installed for compute capability 8.0 - cc_flag = [] - _, bare_metal_major, _ = get_cuda_bare_metal_version(CUDA_HOME) - if int(bare_metal_major) < 11: - raise RuntimeError("--fmha only supported on SM80") - cc_flag.append("-gencode") - cc_flag.append("arch=compute_80,code=sm_80") - - if CUDA_HOME is None and not IS_ROCM_PYTORCH: - raise RuntimeError("--fmha was requested, but nvcc was not found. Are you sure your environment has nvcc available? 
If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.") - else: - # Check, if CUDA11 is installed for compute capability 8.0 - cc_flag = [] - _, bare_metal_major, _ = get_cuda_bare_metal_version(CUDA_HOME) - if int(bare_metal_major) < 11: - raise RuntimeError("--fmha only supported on SM80") - - ext_modules.append( - CUDAExtension(name='fmhalib', - sources=[ - 'apex/contrib/csrc/fmha/fmha_api.cpp', - 'apex/contrib/csrc/fmha/src/fmha_noloop_reduce.cu', - 'apex/contrib/csrc/fmha/src/fmha_fprop_fp16_128_64_kernel.sm80.cu', - 'apex/contrib/csrc/fmha/src/fmha_fprop_fp16_256_64_kernel.sm80.cu', - 'apex/contrib/csrc/fmha/src/fmha_fprop_fp16_384_64_kernel.sm80.cu', - 'apex/contrib/csrc/fmha/src/fmha_fprop_fp16_512_64_kernel.sm80.cu', - 'apex/contrib/csrc/fmha/src/fmha_dgrad_fp16_128_64_kernel.sm80.cu', - 'apex/contrib/csrc/fmha/src/fmha_dgrad_fp16_256_64_kernel.sm80.cu', - 'apex/contrib/csrc/fmha/src/fmha_dgrad_fp16_384_64_kernel.sm80.cu', - 'apex/contrib/csrc/fmha/src/fmha_dgrad_fp16_512_64_kernel.sm80.cu', - ], - extra_compile_args={'cxx': ['-O3', - ] + version_dependent_macros + generator_flag, - 'nvcc':['-O3', - '-gencode', 'arch=compute_80,code=sm_80', - '-U__CUDA_NO_HALF_OPERATORS__', - '-U__CUDA_NO_HALF_CONVERSIONS__', - '--expt-relaxed-constexpr', - '--expt-extended-lambda', - '--use_fast_math'] + version_dependent_macros + generator_flag + cc_flag}, - include_dirs=[os.path.join(this_dir, "apex/contrib/csrc"), os.path.join(this_dir, "apex/contrib/csrc/fmha/src")])) - - -if "--fast_multihead_attn" in sys.argv or "--cuda_ext" in sys.argv: - if "--fast_multihead_attn" in sys.argv: - sys.argv.remove("--fast_multihead_attn") - - if torch.utils.cpp_extension.CUDA_HOME is None and not IS_ROCM_PYTORCH: - raise RuntimeError("--fast_multihead_attn was requested, but nvcc was not found. Are you sure your environment has nvcc available? 
If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.") - else: - # Check, if CUDA11 is installed for compute capability 8.0 - cc_flag = [] - if not IS_ROCM_PYTORCH: - _, bare_metal_major, _ = get_cuda_bare_metal_version(torch.utils.cpp_extension.CUDA_HOME) - if int(bare_metal_major) >= 11: - cc_flag.append('-gencode') - cc_flag.append('arch=compute_80,code=sm_80') - cc_flag.append('-gencode') - cc_flag.append('arch=compute_86,code=sm_86') - - subprocess.run(["git", "submodule", "update", "--init", "apex/contrib/csrc/multihead_attn/cutlass"]) - nvcc_args_mha = ['-O3', - '-gencode', - 'arch=compute_70,code=sm_70', - '-Iapex/contrib/csrc/multihead_attn/cutlass', - '-U__CUDA_NO_HALF_OPERATORS__', - '-U__CUDA_NO_HALF_CONVERSIONS__', - '--expt-relaxed-constexpr', - '--expt-extended-lambda', - '--use_fast_math'] + version_dependent_macros + generator_flag + cc_flag - hipcc_args_mha = ['-O3', - '-Iapex/contrib/csrc/multihead_attn/cutlass', - '-I/opt/rocm/include/hiprand', - '-I/opt/rocm/include/rocrand', - '-U__HIP_NO_HALF_OPERATORS__', - '-U__HIP_NO_HALF_CONVERSIONS__'] + version_dependent_macros + generator_flag - if found_Backward_Pass_Guard: - hipcc_args_mha = hipcc_args_mha + ['-DBACKWARD_PASS_GUARD'] + ['-DBACKWARD_PASS_GUARD_CLASS=BackwardPassGuard'] - if found_ROCmBackward_Pass_Guard: - hipcc_args_mha = hipcc_args_mha + ['-DBACKWARD_PASS_GUARD'] + ['-DBACKWARD_PASS_GUARD_CLASS=ROCmBackwardPassGuard'] - - ext_modules.append( - CUDAExtension( - name='fast_multihead_attn', - sources=[ - 'apex/contrib/csrc/multihead_attn/multihead_attn_frontend.cpp', - 'apex/contrib/csrc/multihead_attn/additive_masked_softmax_dropout_cuda.cu', - "apex/contrib/csrc/multihead_attn/masked_softmax_dropout_cuda.cu", - "apex/contrib/csrc/multihead_attn/encdec_multihead_attn_cuda.cu", - "apex/contrib/csrc/multihead_attn/encdec_multihead_attn_norm_add_cuda.cu", - "apex/contrib/csrc/multihead_attn/self_multihead_attn_cuda.cu", - "apex/contrib/csrc/multihead_attn/self_multihead_attn_bias_additive_mask_cuda.cu", - "apex/contrib/csrc/multihead_attn/self_multihead_attn_bias_cuda.cu", - "apex/contrib/csrc/multihead_attn/self_multihead_attn_norm_add_cuda.cu", - ], - include_dirs=[os.path.join(this_dir, 'csrc'), - os.path.join(this_dir, 'apex/contrib/csrc/multihead_attn')], - extra_compile_args={'cxx': ['-O3',] + version_dependent_macros + generator_flag, - 'nvcc':nvcc_args_mha if not IS_ROCM_PYTORCH else hipcc_args_mha} - ) - ) - -if "--transducer" in sys.argv or "--cuda_ext" in sys.argv: - if "--transducer" in sys.argv: - sys.argv.remove("--transducer") - - if not IS_ROCM_PYTORCH: - raise_if_cuda_home_none("--transducer") - - ext_modules.append( - CUDAExtension( - name="transducer_joint_cuda", - sources=[ - "apex/contrib/csrc/transducer/transducer_joint.cpp", - "apex/contrib/csrc/transducer/transducer_joint_kernel.cu", - ], - extra_compile_args={ - "cxx": ["-O3"] + version_dependent_macros + generator_flag, - "nvcc": append_nvcc_threads(["-O3"] + version_dependent_macros + generator_flag) if not IS_ROCM_PYTORCH - else ["-O3"] + version_dependent_macros + generator_flag, - }, - include_dirs=[os.path.join(this_dir, "csrc"), os.path.join(this_dir, "apex/contrib/csrc/multihead_attn")], - ) - ) - ext_modules.append( - CUDAExtension( - name="transducer_loss_cuda", - sources=[ - "apex/contrib/csrc/transducer/transducer_loss.cpp", - "apex/contrib/csrc/transducer/transducer_loss_kernel.cu", - ], - include_dirs=[os.path.join(this_dir, 
"csrc")], - extra_compile_args={ - "cxx": ["-O3"] + version_dependent_macros, - "nvcc": append_nvcc_threads(["-O3"] + version_dependent_macros) if not IS_ROCM_PYTORCH - else ["-O3"] + version_dependent_macros, - }, - ) - ) +def get_op_build_env_name(op_name): + assert hasattr(ALL_OPS[op_name], 'BUILD_VAR'), \ + f"{op_name} is missing BUILD_VAR field" + return ALL_OPS[op_name].BUILD_VAR -# note (mkozuki): Now `--fast_bottleneck` option (i.e. apex/contrib/bottleneck) depends on `--peer_memory` and `--nccl_p2p`. -if "--fast_bottleneck" in sys.argv: - sys.argv.remove("--fast_bottleneck") - raise_if_cuda_home_none("--fast_bottleneck") - if check_cudnn_version_and_warn("--fast_bottleneck", 8400): - subprocess.run(["git", "submodule", "update", "--init", "apex/contrib/csrc/cudnn-frontend/"]) - ext_modules.append( - CUDAExtension( - name="fast_bottleneck", - sources=["apex/contrib/csrc/bottleneck/bottleneck.cpp"], - include_dirs=[os.path.join(this_dir, "apex/contrib/csrc/cudnn-frontend/include")], - extra_compile_args={"cxx": ["-O3"] + version_dependent_macros + generator_flag}, - ) - ) - -if "--peer_memory" in sys.argv or "--cuda_ext" in sys.argv: - if "--peer_memory" in sys.argv: - sys.argv.remove("--peer_memory") - - if not IS_ROCM_PYTORCH: - raise_if_cuda_home_none("--peer_memory") - - ext_modules.append( - CUDAExtension( - name="peer_memory_cuda", - sources=[ - "apex/contrib/csrc/peer_memory/peer_memory_cuda.cu", - "apex/contrib/csrc/peer_memory/peer_memory.cpp", - ], - extra_compile_args={"cxx": ["-O3"] + version_dependent_macros + generator_flag}, - ) - ) -if "--nccl_p2p" in sys.argv or "--cuda_ext" in sys.argv: - if "--nccl_p2p" in sys.argv: - sys.argv.remove("--nccl_p2p") - - if not IS_ROCM_PYTORCH: - raise_if_cuda_home_none("--nccl_p2p") - - ext_modules.append( - CUDAExtension( - name="nccl_p2p_cuda", - sources=[ - "apex/contrib/csrc/nccl_p2p/nccl_p2p_cuda.cu", - "apex/contrib/csrc/nccl_p2p/nccl_p2p.cpp", - ], - extra_compile_args={"cxx": ["-O3"] + version_dependent_macros + generator_flag}, - ) - ) +def op_build_enabled(op_name): + env_var = get_op_build_env_name(op_name) + return int(get_env_if_set(env_var, BUILD_OP_DEFAULT)) +def is_op_build_included(op_name): + #check if operation has BUILD_FLAG defined + assert hasattr(ALL_OPS[op_name], 'INCLUDE_FLAG'), \ + f"{op_name} is missing INCLUDE_FLAG field" + include_flag = ALL_OPS[op_name].INCLUDE_FLAG + return get_env_if_set(include_flag, False) -if "--fused_conv_bias_relu" in sys.argv: - sys.argv.remove("--fused_conv_bias_relu") - raise_if_cuda_home_none("--fused_conv_bias_relu") - if check_cudnn_version_and_warn("--fused_conv_bias_relu", 8400): - subprocess.run(["git", "submodule", "update", "--init", "apex/contrib/csrc/cudnn-frontend/"]) - ext_modules.append( - CUDAExtension( - name="fused_conv_bias_relu", - sources=["apex/contrib/csrc/conv_bias_relu/conv_bias_relu.cpp"], - include_dirs=[os.path.join(this_dir, "apex/contrib/csrc/cudnn-frontend/include")], - extra_compile_args={"cxx": ["-O3"] + version_dependent_macros + generator_flag}, - ) - ) - -#NCCL allocator is supported for apex 1.6 version and onwards -if TORCH_MAJOR == 2 and TORCH_MINOR >= 6: - if "--nccl_allocator" in sys.argv or "--cuda_ext" in sys.argv: - if "--nccl_allocator" in sys.argv: - sys.argv.remove("--nccl_allocator") - raise_if_cuda_home_none("--nccl_allocator") - _nccl_version_getter = load( - name="_nccl_version_getter", - sources=["apex/contrib/csrc/nccl_p2p/nccl_version.cpp", "apex/contrib/csrc/nccl_p2p/nccl_version_check.cu"], - ) - ccl_library = ["nccl"] - if 
IS_ROCM_PYTORCH: - ccl_library = ["rccl"] - _available_nccl_version = _nccl_version_getter.get_nccl_version() - if _available_nccl_version >= (2, 19): - ext_modules.append( - CUDAExtension( - name="_apex_nccl_allocator", - sources=[ - "apex/contrib/csrc/nccl_allocator/NCCLAllocator.cpp", - ], - include_dirs=[os.path.join(this_dir, "apex/apex/contrib/csrc/nccl_allocator")], - libraries=ccl_library, - extra_compile_args={"cxx": ["-O3"] + version_dependent_macros + generator_flag}, - ) - ) +ext_modules = [] +install_ops = dict.fromkeys(ALL_OPS.keys(), False) + +for op_name, builder in ALL_OPS.items(): + op_compatible = builder.is_compatible() + build_enabled = op_build_enabled(op_name) or is_op_build_included(op_name) + + # If op is requested but not available, throw an error. + if build_enabled and not op_compatible: + env_var = get_op_build_env_name(op_name) + builder.warning(f"Skip pre-compile of incompatible {op_name}; One can disable {op_name} with {env_var}=0") + continue + + # If op is compatible but install is not build enabled (JIT mode). + if IS_ROCM_PYTORCH and op_compatible and not build_enabled: + builder.hipify_extension() + + # If op build enabled, add builder to extensions. + # Also check if corresponding flags are checked + if build_enabled and op_compatible: + install_ops[op_name] = True + ext_modules.append(builder.builder()) + +print(f'Install Ops={install_ops}') + +# Write out version/git info. +git_hash_cmd = shlex.split("bash -c \"git rev-parse --short HEAD\"") +git_branch_cmd = shlex.split("bash -c \"git rev-parse --abbrev-ref HEAD\"") +if command_exists('git') and not is_env_set('APEX_BUILD_STRING'): + try: + result = subprocess.check_output(git_hash_cmd) + git_hash = result.decode('utf-8').strip() + result = subprocess.check_output(git_branch_cmd) + git_branch = result.decode('utf-8').strip() + except subprocess.CalledProcessError: + git_hash = "unknown" + git_branch = "unknown" +else: + git_hash = "unknown" + git_branch = "unknown" + +# Parse the apex version string from version.txt. +version_str = get_apex_version() +version_str += f'+{git_hash}' + +torch_version = ".".join([str(TORCH_MAJOR), str(TORCH_MINOR)]) +bf16_support = False +# Set cuda_version to 0.0 if cpu-only. +cuda_version = "0.0" +nccl_version = "0.0" +# Set hip_version to 0.0 if cpu-only. +hip_version = "0.0" +if torch.version.cuda is not None: + cuda_version = ".".join(torch.version.cuda.split('.')[:2]) + if sys.platform != "win32": + if isinstance(torch.cuda.nccl.version(), int): + # This will break if minor version > 9. 
+ nccl_version = ".".join(str(torch.cuda.nccl.version())[:2]) else: - warnings.warn( - f"Skip `--nccl_allocator` as it requires NCCL 2.19 or later, but {_available_nccl_version[0]}.{_available_nccl_version[1]}" - ) - + nccl_version = ".".join(map(str, torch.cuda.nccl.version()[:2])) + if hasattr(torch.cuda, 'is_bf16_supported') and torch.cuda.is_available(): + bf16_support = torch.cuda.is_bf16_supported() +if hasattr(torch.version, 'hip') and torch.version.hip is not None: + hip_version = ".".join(torch.version.hip.split('.')[:2]) +torch_info = { + "version": torch_version, + "bf16_support": bf16_support, + "cuda_version": cuda_version, + "nccl_version": nccl_version, + "hip_version": hip_version +} + +print(f"version={version_str}, git_hash={git_hash}, git_branch={git_branch}") +with open('apex/git_version_info_installed.py', 'w') as fd: + fd.write(f"version='{version_str}'\n") + fd.write(f"git_hash='{git_hash}'\n") + fd.write(f"git_branch='{git_branch}'\n") + fd.write(f"installed_ops={install_ops}\n") + fd.write(f"build_flags={build_flags}\n") + fd.write(f"torch_info={torch_info}\n") +if "--cpp_ext" in sys.argv: + sys.argv.remove("--cpp_ext") if "--cuda_ext" in sys.argv: sys.argv.remove("--cuda_ext") @@ -982,16 +269,46 @@ def check_if_rocm_pytorch(): with open('requirements.txt') as f: required = f.read().splitlines() +# Find python files in compatibility folder +compatibility_dir = os.path.join(this_dir, 'compatibility') +py_modules = [] + +if os.path.exists(compatibility_dir): + for file in os.listdir(compatibility_dir): + if file.endswith('.py') and file != '__init__.py': + module_name = f"{file[:-3]}" + py_modules.append(module_name) + + #copy outside temporarily + src_file = os.path.join(compatibility_dir, file) + dst_file = os.path.join(this_dir, file) + shutil.copy2(src_file, dst_file) +else: + print("Warning: compatibility folder not found") + +class BinaryDistribution(Distribution): + """Force wheel to be platform-specific even without ext_modules.""" + def has_ext_modules(self): + return True + setup( name="apex", version=get_apex_version(), packages=find_packages( - exclude=("build", "csrc", "include", "tests", "dist", "docs", "tests", "examples", "apex.egg-info",) + exclude=("build", "include", "tests", "dist", "docs", "tests", "examples", "apex.egg-info", "op_builder", "compatibility") ), description="PyTorch Extensions written by NVIDIA", ext_modules=ext_modules, cmdclass={'build_ext': BuildExtension} if ext_modules else {}, extras_require=extras, - install_requires=required + install_requires=required, + include_package_data=True, + py_modules=py_modules, + distclass=BinaryDistribution ) +#delete the temporarily copied compatibility files +for py_module in py_modules: + path = dst_file = os.path.join(this_dir, py_module + ".py") + if os.path.exists(path): + os.remove(path) \ No newline at end of file diff --git a/tests/jit_build/build.sh b/tests/jit_build/build.sh new file mode 100644 index 000000000..1cb09af96 --- /dev/null +++ b/tests/jit_build/build.sh @@ -0,0 +1,62 @@ +#parse the arguments +JIT_CONDITION="$2" +echo "JIT_CONDITION $JIT_CONDITION" + +echo $(pwd) + +git checkout Refactor_build +git submodule update --init --recursive + +# uninstall apex +pip uninstall apex -y +make clean + +#install apex for different conditions +if [ "$JIT_CONDITION" = "1" ]; then + pip install . --no-build-isolation +elif [ "$JIT_CONDITION" = "2" ]; then + APEX_BUILD_CPP_OPS=1 pip install . --no-build-isolation +elif [ "$JIT_CONDITION" = "3" ]; then + APEX_BUILD_CUDA_OPS=1 pip install . 
--no-build-isolation +elif [ "$JIT_CONDITION" = "4" ]; then + APEX_BUILD_CPP_OPS=1 APEX_BUILD_CUDA_OPS=1 pip install . --no-build-isolation +elif [ "$JIT_CONDITION" = "5" ]; then + APEX_BUILD_FUSED_DENSE=1 pip install . --no-build-isolation +elif [ "$JIT_CONDITION" = "6" ]; then + python setup.py install --cpp_ext --cuda_ext +elif [ "$JIT_CONDITION" = "7" ]; then + APEX_BUILD_AMP_C=1 APEX_BUILD_APEX_C=1 APEX_BUILD_BNP=1 \ + APEX_BUILD_DISTRIBUTED_ADAM=1 APEX_BUILD_DISTRIBUTED_LAMB=1 APEX_BUILD_FAST_MULTIHEAD_ATTN=1 \ + APEX_BUILD_FOCAL_LOSS=1 APEX_BUILD_FUSED_ADAM=1 APEX_BUILD_FUSED_BIAS_SWIGLU=1 \ + APEX_BUILD_FUSED_DENSE=1 APEX_BUILD_FUSED_INDEX_MUL_2D=1 APEX_BUILD_FUSED_LAMB=1 \ + APEX_BUILD_FUSED_LAYER_NORM=1 APEX_BUILD_FUSED_ROPE=1 APEX_BUILD_FUSED_WEIGHT_GRADIENT_MLP=1 \ + APEX_BUILD_GENERIC_SCALED_MASKED_SOFTMAX_CUDA=1 APEX_BUILD_MLP=1 APEX_BUILD_NCCL_ALLOCATOR=1 \ + APEX_BUILD_NCCL_P2P=1 APEX_BUILD_PEER_MEMORY=1 APEX_BUILD_SCALED_MASKED_SOFTMAX_CUDA=1 \ + APEX_BUILD_SCALED_SOFTMAX_CUDA=1 APEX_BUILD_SCALED_UPPER_TRIANG_MASKED_SOFTMAX_CUDA=1 APEX_BUILD_SYNCBN=1 \ + APEX_BUILD_TRANSDUCER_JOINT=1 APEX_BUILD_TRANSDUCER_LOSS=1 APEX_BUILD_XENTROPY=1 pip install . --no-build-isolation +elif [ "$JIT_CONDITION" = "8" ]; then + python -m build --wheel --no-isolation . + pip install dist/apex-*.whl +elif [ "$JIT_CONDITION" = "9" ]; then + APEX_BUILD_CPP_OPS=1 python -m build --wheel --no-isolation . +elif [ "$JIT_CONDITION" = "10" ]; then + APEX_BUILD_CUDA_OPS=1 python -m build --wheel --no-isolation . + pip install dist/apex-*.whl +elif [ "$JIT_CONDITION" = "11" ]; then + APEX_BUILD_CPP_OPS=1 APEX_BUILD_CUDA_OPS=1 python -m build --wheel --no-isolation . + pip install dist/apex-*.whl +elif [ "$JIT_CONDITION" = "12" ]; then + APEX_BUILD_FUSED_DENSE=1 python -m build --wheel --no-isolation . + pip install dist/apex-*.whl +elif [ "$JIT_CONDITION" = "13" ]; then + APEX_BUILD_AMP_C=1 APEX_BUILD_APEX_C=1 APEX_BUILD_BNP=1 \ + APEX_BUILD_DISTRIBUTED_ADAM=1 APEX_BUILD_DISTRIBUTED_LAMB=1 APEX_BUILD_FAST_MULTIHEAD_ATTN=1 \ + APEX_BUILD_FOCAL_LOSS=1 APEX_BUILD_FUSED_ADAM=1 APEX_BUILD_FUSED_BIAS_SWIGLU=1 \ + APEX_BUILD_FUSED_DENSE=1 APEX_BUILD_FUSED_INDEX_MUL_2D=1 APEX_BUILD_FUSED_LAMB=1 \ + APEX_BUILD_FUSED_LAYER_NORM=1 APEX_BUILD_FUSED_ROPE=1 APEX_BUILD_FUSED_WEIGHT_GRADIENT_MLP=1 \ + APEX_BUILD_GENERIC_SCALED_MASKED_SOFTMAX_CUDA=1 APEX_BUILD_MLP=1 APEX_BUILD_NCCL_ALLOCATOR=1 \ + APEX_BUILD_NCCL_P2P=1 APEX_BUILD_PEER_MEMORY=1 APEX_BUILD_SCALED_MASKED_SOFTMAX_CUDA=1 \ + APEX_BUILD_SCALED_SOFTMAX_CUDA=1 APEX_BUILD_SCALED_UPPER_TRIANG_MASKED_SOFTMAX_CUDA=1 APEX_BUILD_SYNCBN=1 \ + APEX_BUILD_TRANSDUCER_JOINT=1 APEX_BUILD_TRANSDUCER_LOSS=1 APEX_BUILD_XENTROPY=1 python -m build --wheel --no-isolation . 
+ pip install dist/apex-*.whl +fi \ No newline at end of file diff --git a/tests/jit_build/build_test.sh b/tests/jit_build/build_test.sh new file mode 100644 index 000000000..5e61b696c --- /dev/null +++ b/tests/jit_build/build_test.sh @@ -0,0 +1,5 @@ +#parse the arguments +JIT_CONDITION="$2" + +sh tests/jit_build/build.sh "condition" $JIT_CONDITION +sh tests/jit_build/run_tests.sh "condition" $JIT_CONDITION \ No newline at end of file diff --git a/tests/jit_build/count_built_so.py b/tests/jit_build/count_built_so.py new file mode 100644 index 000000000..353034acb --- /dev/null +++ b/tests/jit_build/count_built_so.py @@ -0,0 +1,11 @@ +import glob +import os +import site + + +SITE_PACKAGES_FOLDERS = site.getsitepackages()[0] + +#count the number of *.so files in the folder +so_files = glob.glob(os.path.join(SITE_PACKAGES_FOLDERS, "apex/*.so"), recursive=True) +count = len(so_files) +print(count) diff --git a/tests/jit_build/count_failed_unit_tests.py b/tests/jit_build/count_failed_unit_tests.py new file mode 100644 index 000000000..c6d95d3ea --- /dev/null +++ b/tests/jit_build/count_failed_unit_tests.py @@ -0,0 +1,16 @@ +import sys + +test_file = sys.argv[1] + +#read lines from test file +with open(test_file, "r") as f: + lines = f.readlines() + +failed_tests = [] +for line in lines: + if "ERROR: " in line: + failed_tests.append(line[7:].strip()) + if " FAILED" in line and "#" not in line: + failed_tests.append(line[: -8].strip()) +print(len(failed_tests)) +#print(str(len(failed_tests)) + "," + ";".join(failed_tests)) \ No newline at end of file diff --git a/tests/jit_build/count_torch_extensions.py b/tests/jit_build/count_torch_extensions.py new file mode 100644 index 000000000..3c8a9fda3 --- /dev/null +++ b/tests/jit_build/count_torch_extensions.py @@ -0,0 +1,9 @@ +import os + +import torch.utils.cpp_extension + +torch_ext_directory = torch.utils.cpp_extension._get_build_directory("", False) +#count the number of folders +folders = [f for f in os.listdir(torch_ext_directory) if os.path.isdir(os.path.join(torch_ext_directory, f))] +count = len(folders) +print(count) \ No newline at end of file diff --git a/tests/jit_build/docker/base.ubuntu.amd.Dockerfile b/tests/jit_build/docker/base.ubuntu.amd.Dockerfile new file mode 100644 index 000000000..b825ba05e --- /dev/null +++ b/tests/jit_build/docker/base.ubuntu.amd.Dockerfile @@ -0,0 +1,3 @@ +# CONTEXT {'gpu_vendor': 'AMD', 'guest_os': 'UBUNTU'} +ARG BASE_DOCKER=rocm/pytorch +FROM $BASE_DOCKER \ No newline at end of file diff --git a/tests/jit_build/load_extra_extensions.py b/tests/jit_build/load_extra_extensions.py new file mode 100644 index 000000000..16d25d2f8 --- /dev/null +++ b/tests/jit_build/load_extra_extensions.py @@ -0,0 +1,16 @@ +from apex.op_builder.fused_lamb import FusedLambBuilder +from apex.op_builder.generic_scaled_masked_softmax_cuda import GenericScaledMaskedSoftmaxCudaBuilder +from apex.op_builder.scaled_softmax_cuda import ScaledSoftmaxCudaBuilder +from apex.op_builder.nccl_p2p import NCCLP2PBuilder + +''' +generic_scaled_masked_softmax_cuda +scaled_softmax_cuda +fused_lamb_cuda +nccl_p2p_cuda +''' + +FusedLambBuilder().load() +GenericScaledMaskedSoftmaxCudaBuilder().load() +ScaledSoftmaxCudaBuilder().load() +NCCLP2PBuilder().load() \ No newline at end of file diff --git a/tests/jit_build/models.json b/tests/jit_build/models.json new file mode 100644 index 000000000..72963295b --- /dev/null +++ b/tests/jit_build/models.json @@ -0,0 +1,158 @@ +[ + { + "name": "apex_jit_install_condition1", + "dockerfile": "docker/base", + 
"scripts": "scripts", + "n_gpus": "8", + "owner": "skishore@amd.com", + "training_precision": "", + "tags": [ + "apex_jit" + ], + "args": "--condition 1" + }, + { + "name": "apex_jit_install_condition2", + "dockerfile": "docker/base", + "scripts": "scripts", + "n_gpus": "8", + "owner": "skishore@amd.com", + "training_precision": "", + "tags": [ + "apex_jit" + ], + "args": "--condition 2" + }, + { + "name": "apex_jit_install_condition3", + "dockerfile": "docker/base", + "scripts": "scripts", + "n_gpus": "8", + "owner": "skishore@amd.com", + "training_precision": "", + "tags": [ + "apex_jit" + ], + "args": "--condition 3" + }, + { + "name": "apex_jit_install_condition4", + "dockerfile": "docker/base", + "scripts": "scripts", + "n_gpus": "8", + "owner": "skishore@amd.com", + "training_precision": "", + "tags": [ + "apex_jit" + ], + "args": "--condition 4" + }, + { + "name": "apex_jit_install_condition5", + "dockerfile": "docker/base", + "scripts": "scripts", + "n_gpus": "8", + "owner": "skishore@amd.com", + "training_precision": "", + "tags": [ + "apex_jit" + ], + "args": "--condition 5" + }, + { + "name": "apex_jit_install_condition6", + "dockerfile": "docker/base", + "scripts": "scripts", + "n_gpus": "8", + "owner": "skishore@amd.com", + "training_precision": "", + "tags": [ + "apex_jit" + ], + "args": "--condition 6" + }, + { + "name": "apex_jit_install_condition7", + "dockerfile": "docker/base", + "scripts": "scripts", + "n_gpus": "8", + "owner": "skishore@amd.com", + "training_precision": "", + "tags": [ + "apex_jit" + ], + "args": "--condition 7" + }, + { + "name": "apex_jit_install_condition8", + "dockerfile": "docker/base", + "scripts": "scripts", + "n_gpus": "8", + "owner": "skishore@amd.com", + "training_precision": "", + "tags": [ + "apex_jit" + ], + "args": "--condition 8" + }, + { + "name": "apex_jit_install_condition9", + "dockerfile": "docker/base", + "scripts": "scripts", + "n_gpus": "8", + "owner": "skishore@amd.com", + "training_precision": "", + "tags": [ + "apex_jit" + ], + "args": "--condition 9" + }, + { + "name": "apex_jit_install_condition10", + "dockerfile": "docker/base", + "scripts": "scripts", + "n_gpus": "8", + "owner": "skishore@amd.com", + "training_precision": "", + "tags": [ + "apex_jit" + ], + "args": "--condition 10" + }, + { + "name": "apex_jit_install_condition11", + "dockerfile": "docker/base", + "scripts": "scripts", + "n_gpus": "8", + "owner": "skishore@amd.com", + "training_precision": "", + "tags": [ + "apex_jit" + ], + "args": "--condition 11" + }, + { + "name": "apex_jit_install_condition12", + "dockerfile": "docker/base", + "scripts": "scripts", + "n_gpus": "8", + "owner": "skishore@amd.com", + "training_precision": "", + "tags": [ + "apex_jit" + ], + "args": "--condition 12" + }, + { + "name": "apex_jit_install_condition13", + "dockerfile": "docker/base", + "scripts": "scripts", + "n_gpus": "8", + "owner": "skishore@amd.com", + "training_precision": "", + "tags": [ + "apex_jit" + ], + "args": "--condition 13" + } +] \ No newline at end of file diff --git a/tests/jit_build/run_tests.sh b/tests/jit_build/run_tests.sh new file mode 100644 index 000000000..eaed64629 --- /dev/null +++ b/tests/jit_build/run_tests.sh @@ -0,0 +1,36 @@ +#parse the arguments +JIT_CONDITION="$2" +echo "JIT_CONDITION $JIT_CONDITION" + +#run the apex unit tests +LOG_FILE=results_jit_unit_test${JIT_CONDITION}.log +LOG_FILE2=results_jit_unit_test${JIT_CONDITION}c.log + +cd tests/L0 +PYTHONUNBUFFERED=1 sh run_rocm.sh 2>&1 | tee ../../$LOG_FILE +cd ../../ + +cd apex/contrib/test 
+PYTHONUNBUFFERED=1 python run_rocm_extensions.py 2>&1 | tee ../../../$LOG_FILE2 +cd ../../../ + +torchrun --nproc_per_node 8 apex/contrib/peer_memory/peer_halo_exchange_module_tests.py 2>&1 | tee -a $LOG_FILE2 + +cd tests/distributed/synced_batchnorm +sh unit_test.sh 2>&1 | tee -a ../../../$LOG_FILE2 +cd ../../../ + +#explicitly load the builder and build the remaining extensions +python tests/jit_build/load_extra_extensions.py 2>&1 | tee $LOG_FILE + +FAILED_TESTS=$(python tests/jit_build/count_failed_unit_tests.py $LOG_FILE) +FAILED_TESTS2=$(python tests/jit_build/count_failed_unit_tests.py $LOG_FILE2) +BUILT_SO_COUNT=$(python tests/jit_build/count_built_so.py) +TORCH_EXTENSIONS_COUNT=$(python tests/jit_build/count_torch_extensions.py) + +echo "Failed L0 tests = $FAILED_TESTS" +echo "Failed contrib tests = $FAILED_TESTS2" +echo ".so count = $BUILT_SO_COUNT" +echo "JIT torch extensions count = $TORCH_EXTENSIONS_COUNT" + +echo "$FAILED_TESTS $FAILED_TESTS2 $BUILT_SO_COUNT $TORCH_EXTENSIONS_COUNT" \ No newline at end of file diff --git a/tests/jit_build/scripts/run.sh b/tests/jit_build/scripts/run.sh new file mode 100644 index 000000000..aeb41fadd --- /dev/null +++ b/tests/jit_build/scripts/run.sh @@ -0,0 +1,25 @@ +#parse the arguments +JIT_CONDITION="$2" + +echo $(pwd) + +WORKSPACE_DIR=/myworkspace +mkdir -p $WORKSPACE_DIR + +cd $WORKSPACE_DIR +git clone https://github.com/rocm/apex.git --recursive +cd apex +git checkout Refactor_build +git submodule update --init --recursive + +sh tests/jit_build/build.sh "condition" $JIT_CONDITION + +# Capture the output from run_tests.sh +TEST_RESULTS=$(sh tests/jit_build/run_tests.sh "condition" $JIT_CONDITION | tail -1) + +# Parse the returned values +read FAILED_TESTS FAILED_TESTS2 BUILT_SO_COUNT TORCH_EXTENSIONS_COUNT <<< "$TEST_RESULTS" + +MULTIPLE_RESULTS_FILE="../results_jit_unit_test.csv" +#echo "condition,failed unit tests" > "$MULTIPLE_RESULTS_FILE" +echo "$JIT_CONDITION,$FAILED_TESTS,$FAILED_TESTS2,$BUILT_SO_COUNT,$TORCH_EXTENSIONS_COUNT" >> "$MULTIPLE_RESULTS_FILE" \ No newline at end of file diff --git a/tests/test_extension_import.py b/tests/test_extension_import.py index 153254ddd..72d88688e 100644 --- a/tests/test_extension_import.py +++ b/tests/test_extension_import.py @@ -2,15 +2,17 @@ import os import subprocess import sys - +import site +import ast +from apex.op_builder.all_ops import ALL_OPS class TestExtensionImport(unittest.TestCase): - def get_extensions_list(self): - """ - This method reads setup.py and gets the list of extensions from the setup.py file - """ + def __init__(self, *args, **kwargs): + super(TestExtensionImport, self).__init__(*args, **kwargs) + + self.jit_info_file = "apex/git_version_info_installed.py" #find the absolute path of this file current_file_path = os.path.abspath(__file__) @@ -21,9 +23,24 @@ def get_extensions_list(self): #apex folder parent_folder_path = os.path.dirname(parent_folder_path) self.parent_folder_path = parent_folder_path + + def is_jit_modules_mode(self): + """ + This method checks if the file git_version_info_installed.py exists + """ + jit_file_path = os.path.join(site.getsitepackages()[0], self.jit_info_file) + #print ("jit_file_path", jit_file_path) + mode = os.path.exists(jit_file_path) + print ("jit_mode", mode) + return mode + + def get_extensions_list_from_setup(self): + """ + This method reads setup.py and gets the list of extensions from the setup.py file + """ #get setup.py file contents - setup_path = os.path.join(parent_folder_path, "setup.py") + setup_path = 
os.path.join(self.parent_folder_path, "setup.py") #read setup_path contents with open(setup_path, 'r') as f: @@ -62,6 +79,21 @@ def get_extensions_list(self): return extensions + def get_jit_modules(self): + """ + This method reads the jit file and extracts installed_ops dictionary + """ + jit_info_path = os.path.join(site.getsitepackages()[0], self.jit_info_file) + with open(jit_info_path, 'r') as f: + lines = f.readlines() + for line in lines: + if "installed_ops" in line: + ops_list = line[len("installed_ops") + 1 : ] + ops_list = ast.literal_eval(ops_list) + #print ("op_list", ops_list) + return list(ops_list.keys()) + return {} + def get_environment(self): """ This method retrieves the environment for testing import @@ -122,10 +154,46 @@ def check_extension_import(self, extension_name, env): print(f"Error testing import for {extension_name}: {e}") return False, str(e) + def check_jit_extension_import(self, extension_name, env): + all_ops = dict.fromkeys(ALL_OPS.keys(), False) + #get the builder for that extension + builder = ALL_OPS[extension_name] + builder_name = type(builder).__name__ + #print ("----builder_name-----", builder_name) + + #increase timeout + timeout = 60 * 60 + try: + # Run Python subprocess to test the import + result = subprocess.run([ + sys.executable, '-c', + 'from apex.op_builder import ' + builder_name + + '\n' + builder_name + "().load()" + ], capture_output=True, text=True, timeout=timeout, env=env) + print ("result.stdout", result.stdout, result.stderr) + # Check if subprocess completed successfully + if result.returncode != 0 and "Error" in result.stderr: + return False, result.stderr + else: + return True, "" + + except subprocess.TimeoutExpired: + print(f"Import test timed out for {extension_name}") + return False, "Timeout" + except Exception as e: + print(f"Error testing import for {extension_name}: {e}") + return False, str(e) + def test_extensions_import(self): - #get the list of extensions - extensions = self.get_extensions_list() + #check the extensions mode + jit_mode = self.is_jit_modules_mode() + + if not jit_mode: + #get the list of extensions from setup.py + extensions = self.get_extensions_list_from_setup() + else: + extensions = self.get_jit_modules() #get environment env = self.get_environment() @@ -135,7 +203,10 @@ def test_extensions_import(self): for extension in extensions: print ("checking extension", extension) with self.subTest(extension=extension): - success, error_message = self.check_extension_import(extension, env) + if not jit_mode: + success, error_message = self.check_extension_import(extension, env) + else: + success, error_message = self.check_jit_extension_import(extension, env) #self.assertTrue(success, f"Failed to import extension: {extension}") results.append((extension, success, error_message))
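
For context, the sketch below shows one way the pieces introduced in this patch fit together at runtime: setup.py records which extensions were pre-built in `apex/git_version_info_installed.py`, and any op left in JIT mode can be compiled on first use through its builder in `apex.op_builder` (the same entry points that `load_extra_extensions.py` and the updated test rely on). This is a minimal illustration, not part of the patch; the exact site-packages path and the `fused_lamb_cuda` key name are assumptions.

```python
# Minimal sketch (not part of the patch): check which apex ops were pre-built
# at install time, then JIT-load one that was left for runtime compilation.
import ast
import os
import site

# setup.py writes installed_ops, build_flags and torch_info into this file.
info_path = os.path.join(site.getsitepackages()[0], "apex", "git_version_info_installed.py")

installed_ops = {}
with open(info_path) as fd:
    for line in fd:
        if line.startswith("installed_ops="):
            installed_ops = ast.literal_eval(line.split("=", 1)[1])

print("pre-built ops:", [name for name, built in installed_ops.items() if built])

# Ops that were not pre-built can be compiled on first use via their builder,
# mirroring load_extra_extensions.py; the op key name here is an assumption.
if not installed_ops.get("fused_lamb_cuda", False):
    from apex.op_builder.fused_lamb import FusedLambBuilder
    fused_lamb_cuda = FusedLambBuilder().load()  # triggers the torch cpp_extension JIT build
```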