Skip to content

Commit fc55d8e

Browse files
committed
CI: Unified multi-stage Docker build
Signed-off-by: Alexey Rivkin <[email protected]>
1 parent 38097e7 commit fc55d8e

File tree

4 files changed

+93
-202
lines changed

4 files changed

+93
-202
lines changed

.ci/jenkins/lib/build-matrix.yaml

Lines changed: 42 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,9 @@
66
# Key Components:
77
# - Job Configuration: Defines timeout, failure behavior, and Kubernetes resources
88
# - Docker Images: Specifies the container images used for different build stages
9-
# - cuda-dl-base images (25.06 for Ubuntu 24.04, 24.10 for Ubuntu 22.04) for building and testing
9+
# - nixl-deps-base images (built from contrib/Dockerfile) with pre-installed dependencies (UCX, libfabric, etc.)
1010
# - Podman image for container builds
11-
# - Matrix Axes: Defines build variations (currently x86_64 architecture)
11+
# - Matrix Axes: Defines build variations (x86_64 and aarch64 architectures)
1212
# - Build Steps: Sequential steps for building, testing, and container creation
1313
#
1414
# When Modified:
@@ -22,6 +22,10 @@
2222
---
2323
job: nixl-ci-build
2424

25+
registry_host: harbor.mellanox.com
26+
registry_auth: swx-infra_harbor_credentials
27+
registry_path: /swx-infra/media
28+
2529
# Whether to fail the job as soon as one of the steps fails, or to continue with the remaining steps
2630
failFast: false
2731

@@ -34,8 +38,42 @@ kubernetes:
3438
requests: "{memory: 8Gi, cpu: 8000m}"
3539

3640
runs_on_dockers:
37-
- { name: "ubuntu24.04-cuda-dl-base", url: "nvcr.io/nvidia/cuda-dl-base:25.06-cuda12.9-devel-ubuntu24.04" }
38-
- { name: "ubuntu22.04-cuda-dl-base", url: "nvcr.io/nvidia/cuda-dl-base:24.10-cuda12.6-devel-ubuntu22.04" }
41+
- {
42+
file: "contrib/Dockerfile",
43+
name: "ubuntu24.04-nixl-deps",
44+
uri: "nixl/$arch/$name",
45+
tag: "latest",
46+
arch: "x86_64",
47+
build_args: "--target nixl-deps-base",
48+
category: "base"
49+
}
50+
- {
51+
file: "contrib/Dockerfile",
52+
name: "ubuntu24.04-nixl-deps",
53+
uri: "nixl/$arch/$name",
54+
tag: "latest",
55+
arch: "aarch64",
56+
build_args: "--target nixl-deps-base",
57+
category: "base"
58+
}
59+
- {
60+
file: "contrib/Dockerfile",
61+
name: "ubuntu22.04-nixl-deps",
62+
uri: "nixl/$arch/$name",
63+
tag: "latest",
64+
arch: "x86_64",
65+
build_args: "--target nixl-deps-base --build-arg BASE_IMAGE_TAG=24.10-cuda12.6-devel-ubuntu22.04",
66+
category: "base"
67+
}
68+
- {
69+
file: "contrib/Dockerfile",
70+
name: "ubuntu22.04-nixl-deps",
71+
uri: "nixl/$arch/$name",
72+
tag: "latest",
73+
arch: "aarch64",
74+
build_args: "--target nixl-deps-base --build-arg BASE_IMAGE_TAG=24.10-cuda12.6-devel-ubuntu22.04",
75+
category: "base"
76+
}
3977
- { name: "podman-v5.0.2", url: "quay.io/podman/stable:v5.0.2", category: 'tool', privileged: true }
4078

4179
matrix:

.gitlab/build.sh

Lines changed: 14 additions & 186 deletions
Original file line numberDiff line numberDiff line change
@@ -14,36 +14,27 @@
1414
# See the License for the specific language governing permissions and
1515
# limitations under the License.
1616

17+
# Simplified build script for CI using nixl-deps-base image
18+
# Dependencies (UCX, libfabric, etcd, aws-sdk, rust, DOCA) are pre-installed in the base image
19+
1720
# shellcheck disable=SC1091
1821
. "$(dirname "$0")/../.ci/scripts/common.sh"
1922

2023
set -e
2124
set -x
2225
set -o pipefail
2326

24-
# Parse commandline arguments with first argument being the install directory
25-
# and second argument being the UCX installation directory.
27+
# Parse commandline arguments
2628
INSTALL_DIR=$1
27-
UCX_INSTALL_DIR=$2
28-
EXTRA_BUILD_ARGS=${3:-""}
29-
# UCX_VERSION is the version of UCX to build; override the default by setting the environment variable.
30-
UCX_VERSION=${UCX_VERSION:-v1.19.0}
31-
# LIBFABRIC_VERSION is the version of libfabric to build; override the default by setting the environment variable.
32-
LIBFABRIC_VERSION=${LIBFABRIC_VERSION:-v2.3.0}
33-
# LIBFABRIC_INSTALL_DIR can be set via environment variable, defaults to INSTALL_DIR
34-
LIBFABRIC_INSTALL_DIR=${LIBFABRIC_INSTALL_DIR:-$INSTALL_DIR}
29+
EXTRA_BUILD_ARGS=${2:-""}
30+
LIBFABRIC_INSTALL_DIR=${LIBFABRIC_INSTALL_DIR:-/usr/local}
3531

3632
if [ -z "$INSTALL_DIR" ]; then
37-
echo "Usage: $0 <install_dir> <ucx_install_dir>"
33+
echo "Usage: $0 <install_dir> [extra_build_args]"
3834
exit 1
3935
fi
4036

41-
if [ -z "$UCX_INSTALL_DIR" ]; then
42-
UCX_INSTALL_DIR=$INSTALL_DIR
43-
fi
44-
45-
46-
# For running as user - check if running as root, if not set sudo variable
37+
# Support running as a non-root user: check whether we are root and, if not, use sudo
4738
if [ "$(id -u)" -ne 0 ]; then
4839
SUDO=sudo
4940
else
@@ -53,187 +44,24 @@ fi
5344
ARCH=$(uname -m)
5445
[ "$ARCH" = "arm64" ] && ARCH="aarch64"
5546

56-
# Some docker images are with broken installations:
57-
$SUDO rm -rf /usr/lib/cmake/grpc /usr/lib/cmake/protobuf
58-
59-
$SUDO apt-get -qq update
60-
$SUDO apt-get -qq install -y python3-dev \
61-
python3-pip \
62-
curl \
63-
wget \
64-
libnuma-dev \
65-
numactl \
66-
autotools-dev \
67-
automake \
68-
git \
69-
libtool \
70-
libz-dev \
71-
libiberty-dev \
72-
flex \
73-
build-essential \
74-
cmake \
75-
libgoogle-glog-dev \
76-
libgtest-dev \
77-
libgmock-dev \
78-
libjsoncpp-dev \
79-
libpython3-dev \
80-
libboost-all-dev \
81-
libssl-dev \
82-
libgrpc-dev \
83-
libgrpc++-dev \
84-
libprotobuf-dev \
85-
libcpprest-dev \
86-
libaio-dev \
87-
liburing-dev \
88-
meson \
89-
ninja-build \
90-
pkg-config \
91-
protobuf-compiler-grpc \
92-
pybind11-dev \
93-
etcd-server \
94-
net-tools \
95-
iproute2 \
96-
pciutils \
97-
libpci-dev \
98-
uuid-dev \
99-
libibmad-dev \
100-
doxygen \
101-
clang \
102-
hwloc \
103-
libhwloc-dev \
104-
libcurl4-openssl-dev zlib1g-dev # aws-sdk-cpp dependencies
105-
106-
# Ubuntu 22.04 specific setup
107-
if grep -q "Ubuntu 22.04" /etc/os-release 2>/dev/null; then
108-
# Upgrade pip for '--break-system-packages' support
109-
$SUDO pip3 install --upgrade pip
110-
111-
# Upgrade meson (distro version 0.61.2 is too old, project requires >= 0.64.0)
112-
$SUDO pip3 install --upgrade meson
113-
# Ensure pip3's meson takes precedence over apt's version
114-
export PATH="$HOME/.local/bin:/usr/local/bin:$PATH"
115-
fi
116-
117-
# Add DOCA repository and install packages
118-
ARCH_SUFFIX=$(if [ "${ARCH}" = "aarch64" ]; then echo "arm64"; else echo "amd64"; fi)
119-
MELLANOX_OS="$(. /etc/lsb-release; echo ${DISTRIB_ID}${DISTRIB_RELEASE} | tr A-Z a-z | tr -d .)"
120-
wget --tries=3 --waitretry=5 --no-verbose https://www.mellanox.com/downloads/DOCA/DOCA_v3.1.0/host/doca-host_3.1.0-091000-25.07-${MELLANOX_OS}_${ARCH_SUFFIX}.deb -O doca-host.deb
121-
$SUDO dpkg -i doca-host.deb
122-
$SUDO apt-get update
123-
$SUDO apt-get upgrade -y
124-
$SUDO apt-get install -y --no-install-recommends doca-sdk-gpunetio libdoca-sdk-gpunetio-dev libdoca-sdk-verbs-dev
125-
126-
# Force reinstall of RDMA packages from DOCA repository
127-
# Reinstall needed to fix broken libibverbs-dev, which may lead to lack of Infiniband support.
128-
# Upgrade is not sufficient if the version is the same since apt skips the installation.
129-
$SUDO apt-get -qq -y install \
130-
--reinstall libibverbs-dev rdma-core ibverbs-utils libibumad-dev \
131-
libnuma-dev librdmacm-dev ibverbs-providers
132-
133-
wget --tries=3 --waitretry=5 https://static.rust-lang.org/rustup/dist/${ARCH}-unknown-linux-gnu/rustup-init
134-
chmod +x rustup-init
135-
./rustup-init -y --default-toolchain 1.86.0
136-
export PATH="$HOME/.cargo/bin:$PATH"
137-
138-
wget --tries=3 --waitretry=5 "https://astral.sh/uv/install.sh" -O install_uv.sh
139-
chmod +x install_uv.sh
140-
./install_uv.sh
141-
export PATH="$HOME/.local/bin:$PATH"
142-
143-
curl -fSsL "https://github.com/openucx/ucx/tarball/${UCX_VERSION}" | tar xz
144-
( \
145-
cd openucx-ucx* && \
146-
./autogen.sh && \
147-
./configure \
148-
--prefix="${UCX_INSTALL_DIR}" \
149-
--enable-shared \
150-
--disable-static \
151-
--disable-doxygen-doc \
152-
--enable-optimizations \
153-
--enable-cma \
154-
--enable-devel-headers \
155-
--with-verbs \
156-
--with-dm \
157-
${UCX_CUDA_BUILD_ARGS} \
158-
--enable-mt && \
159-
make -j && \
160-
make -j install-strip && \
161-
$SUDO ldconfig \
162-
)
163-
164-
wget --tries=3 --waitretry=5 -O "libfabric-${LIBFABRIC_VERSION#v}.tar.bz2" "https://github.com/ofiwg/libfabric/releases/download/${LIBFABRIC_VERSION}/libfabric-${LIBFABRIC_VERSION#v}.tar.bz2"
165-
tar xjf "libfabric-${LIBFABRIC_VERSION#v}.tar.bz2"
166-
rm "libfabric-${LIBFABRIC_VERSION#v}.tar.bz2"
167-
( \
168-
cd libfabric-* && \
169-
./autogen.sh && \
170-
./configure --prefix="${LIBFABRIC_INSTALL_DIR}" \
171-
--disable-verbs \
172-
--disable-psm3 \
173-
--disable-opx \
174-
--disable-usnic \
175-
--disable-rstream \
176-
--enable-efa && \
177-
make -j && \
178-
make install && \
179-
$SUDO ldconfig \
180-
)
181-
182-
( \
183-
cd /tmp && \
184-
git clone --depth 1 https://github.com/etcd-cpp-apiv3/etcd-cpp-apiv3.git && \
185-
cd etcd-cpp-apiv3 && \
186-
mkdir build && cd build && \
187-
cmake .. && \
188-
make -j"$NPROC" && \
189-
$SUDO make install && \
190-
$SUDO ldconfig \
191-
)
192-
193-
( \
194-
cd /tmp && \
195-
git clone --recurse-submodules --depth 1 --shallow-submodules https://github.com/aws/aws-sdk-cpp.git --branch 1.11.581 && \
196-
mkdir aws_sdk_build && \
197-
cd aws_sdk_build && \
198-
cmake ../aws-sdk-cpp/ -DCMAKE_BUILD_TYPE=Release -DBUILD_ONLY="s3" -DENABLE_TESTING=OFF -DCMAKE_INSTALL_PREFIX=/usr/local && \
199-
make -j"$NPROC" && \
200-
$SUDO make install
201-
)
202-
203-
( \
204-
cd /tmp && \
205-
git clone https://github.com/nvidia/gusli.git && \
206-
cd gusli && \
207-
$SUDO make all BUILD_RELEASE=1 BUILD_FOR_UNITEST=0 VERBOSE=1 ALLOW_USE_URING=0 && \
208-
$SUDO ldconfig
209-
)
210-
211-
( \
212-
cd /tmp &&
213-
git clone --depth 1 https://github.com/google/gtest-parallel.git &&
214-
mkdir -p ${INSTALL_DIR}/bin &&
215-
cp gtest-parallel/* ${INSTALL_DIR}/bin/
216-
)
217-
47+
# Set library and binary paths
21848
export LD_LIBRARY_PATH="${INSTALL_DIR}/lib:${INSTALL_DIR}/lib/$ARCH-linux-gnu:${INSTALL_DIR}/lib64:$LD_LIBRARY_PATH:${LIBFABRIC_INSTALL_DIR}/lib"
21949
export CPATH="${INSTALL_DIR}/include:${LIBFABRIC_INSTALL_DIR}/include:$CPATH"
22050
export PATH="${INSTALL_DIR}/bin:$PATH"
22151
export PKG_CONFIG_PATH="${INSTALL_DIR}/lib/pkgconfig:${INSTALL_DIR}/lib64/pkgconfig:${INSTALL_DIR}:${LIBFABRIC_INSTALL_DIR}/lib/pkgconfig:$PKG_CONFIG_PATH"
22252
export NIXL_PLUGIN_DIR="${INSTALL_DIR}/lib/$ARCH-linux-gnu/plugins"
22353
export CMAKE_PREFIX_PATH="${INSTALL_DIR}:${CMAKE_PREFIX_PATH}"
22454

225-
# Disabling CUDA IPC not to use NVLINK, as it slows down local
226-
# UCX transfers and can cause contention with local collectives.
55+
# Disable the CUDA IPC transport so that UCX does not use NVLink
22756
export UCX_TLS=^cuda_ipc
22857

58+
# Build NIXL
22959
# shellcheck disable=SC2086
230-
meson setup nixl_build --prefix=${INSTALL_DIR} -Ducx_path=${UCX_INSTALL_DIR} -Dbuild_docs=true -Drust=false ${EXTRA_BUILD_ARGS} -Dlibfabric_path="${LIBFABRIC_INSTALL_DIR}"
60+
meson setup nixl_build --prefix=${INSTALL_DIR} -Ducx_path=/usr -Dbuild_docs=true -Drust=false ${EXTRA_BUILD_ARGS} -Dlibfabric_path="${LIBFABRIC_INSTALL_DIR}"
61+
ninja -j"$NPROC" -C nixl_build && ninja -j"$NPROC" -C nixl_build install
23162
mkdir -p dist && cp nixl_build/src/bindings/python/nixl-meta/nixl-*.whl dist/
232-
ninja -j${NPROC:-$(nproc)} -C nixl_build && ninja -j${NPROC:-$(nproc)} -C nixl_build install
233-
234-
# TODO(kapila): Copy the nixl.pc file to the install directory if needed.
235-
# cp ${BUILD_DIR}/nixl.pc ${INSTALL_DIR}/lib/pkgconfig/nixl.pc
23663

64+
# Build nixlbench
23765
cd benchmark/nixlbench
23866
meson setup nixlbench_build -Dnixl_path=${INSTALL_DIR} -Dprefix=${INSTALL_DIR}
23967
ninja -j"$NPROC" -C nixlbench_build && ninja -j"$NPROC" -C nixlbench_build install

.gitlab/test_python.sh

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,9 +41,8 @@ export NIXL_PREFIX=${INSTALL_DIR}
4141
export NIXL_DEBUG_LOGGING=yes
4242

4343
# Control ninja parallelism during pip build to prevent OOM (NPROC from common.sh)
44-
pip3 install --break-system-packages --config-settings=compile-args="-j${NPROC:-$(nproc)}" .
45-
pip3 install --break-system-packages dist/nixl-*none-any.whl
4644
pip3 install --break-system-packages --config-settings=compile-args="-j${NPROC}" .
45+
pip3 install --break-system-packages dist/nixl-*none-any.whl
4746
pip3 install --break-system-packages pytest
4847
pip3 install --break-system-packages pytest-timeout
4948
pip3 install --break-system-packages zmq

0 commit comments

Comments
 (0)