diff --git a/deps.svg b/deps.svg index 068a7931..98efbe62 100644 --- a/deps.svg +++ b/deps.svg @@ -1,2238 +1,2418 @@ - - + - - + + -n120 - -Alpine: autoconf +n65 + +Alpine: autoconf - - -n27 - -libtirpc + + +n33 + +libtirpc - + -n120->n27 - - +n65->n33 + + - + -n30 - -Alpine: bash - - - -n25 - -elfutils - - - -n30->n25 - - - - - -n47 - -glibc - - - -n30->n47 - - +n27 + +Alpine: bash - - -n24 - -libcap + + +n22 + +elfutils - - -n30->n24 - - + + +n27->n22 + + - - -n23 - -libseccomp + + +n51 + +glibc - + -n30->n23 - - +n27->n51 + + - + + +n31 + +libcap + + -n30->n27 - - +n27->n31 + + - - -n43 - -nvidia-container-cli-lts + + +n32 + +libseccomp + + + +n27->n32 + + - + -n30->n43 - - +n27->n33 + + - - -n20 - -nvidia-container-cli-production + + +n84 + +nvidia-container-cli-lts - - -n30->n20 - - + + +n27->n84 + + - - -n45 - -nvidia-container-runtime + + +n30 + +nvidia-container-cli-production - + -n30->n45 - - +n27->n30 + + - - -n44 - -nvidia-pkgs-lts + + +n34 + +nvidia-container-runtime - - -n30->n44 - - + + +n27->n34 + + - - -n22 - -nvidia-pkgs-production + + +n39 + +nvidia-fabricmanager-wrapper - + + +n27->n39 + + + + + +n85 + +nvidia-pkgs-lts + + -n30->n22 - - +n27->n85 + + - - -n26 - -zlib + + +n50 + +nvidia-pkgs-production - - -n30->n26 - - + + +n27->n50 + + - + + +n24 + +zlib + + + +n27->n24 + + + + -n100 - -Alpine: bison +n131 + +Alpine: bison - - -n100->n47 - - + + +n131->n51 + + - + -n28 - -Alpine: build-base +n25 + +Alpine: build-base - + -n28->n25 - - - - - -n28->n47 - - +n25->n22 + + - - -n28->n24 - - + + +n25->n51 + + - - -n28->n23 - - + + +n25->n31 + + - - -n28->n27 - - + + +n25->n32 + + - + -n28->n43 - - +n25->n33 + + - - -n28->n20 - - + + +n25->n84 + + - + -n28->n45 - - +n25->n30 + + - - -n28->n26 - - + + +n25->n34 + + - + + +n25->n39 + + + + + +n25->n24 + + + + -n128 - -Alpine: cargo +n125 + +Alpine: cargo - - -n126 - -xen-guest-agent + + +n123 + +xen-guest-agent - - -n128->n126 - - + + +n125->n123 + + - + -n130 - -Alpine: clang-dev +n127 + +Alpine: clang-dev - - -n130->n126 - - + + +n127->n123 + + - + -n32 - -Alpine: coreutils +n138 + +Alpine: cmake - - -n32->n43 - - - - - -n32->n20 - - + + +n138->n39 + + - + -n34 - -Alpine: curl +n67 + +Alpine: coreutils - - -n34->n43 - - + + +n67->n84 + + - - -n34->n20 - - + + +n67->n30 + + - + -n99 - -Alpine: gawk +n69 + +Alpine: curl - - -n99->n47 - - + + +n69->n84 + + - + + +n69->n30 + + + + -n29 - -Alpine: gcc-14 +n130 + +Alpine: gawk - - -n29->n25 - - + + +n130->n51 + + - - -n29->n47 - - + + +n26 + +Alpine: gcc-14 - - -n29->n24 - - + + +n26->n22 + + - - -n29->n23 - - + + +n26->n51 + + - - -n29->n27 - - + + +n26->n31 + + - - -n29->n43 - - + + +n26->n32 + + - - -n29->n20 - - + + +n26->n33 + + - - -n29->n45 - - + + +n26->n84 + + - + -n29->n26 - - +n26->n30 + + - - -n31 - -Alpine: go + + +n26->n34 + + - - -n31->n43 - - + + +n26->n39 + + - + -n31->n20 - - +n26->n24 + + - - -n31->n45 - - - - + -n103 - -Alpine: gperf +n66 + +Alpine: go - - -n103->n23 - - + + +n66->n84 + + - + + +n66->n30 + + + + + +n66->n34 + + + + -n131 - -Alpine: llvm-dev +n141 + +Alpine: go-1.24 - - -n131->n126 - - + + +n141->n39 + + - + -n42 - -Alpine: m4 +n90 + +Alpine: gperf - - -n42->n25 - - + + +n90->n32 + + - + -n36 - -Alpine: patch +n140 + +Alpine: libnl3-dev - + -n36->n24 - - +n140->n39 + + - - -n36->n43 - - - - - -n36->n20 - - - - - -n36->n45 - - - - + -n132 - -Alpine: pkgconf +n128 + +Alpine: llvm-dev - + -n132->n126 - - +n128->n123 + + - + -n101 - -Alpine: python3 +n28 + +Alpine: m4 - + -n101->n47 - - +n28->n22 + + - + -n35 - -Alpine: rpcsvc-proto +n139 + +Alpine: ninja - - -n35->n43 - - - - + -n35->n20 - - +n139->n39 + + - + -n33 - -Alpine: sed +n71 + +Alpine: patch - - -n33->n43 - - + + +n71->n31 + + - + -n33->n20 - - +n71->n84 + + + + + +n71->n30 + + + + + +n71->n34 + + n129 - -Alpine: xen-dev + +Alpine: pkgconf - - -n129->n126 - - + + +n129->n123 + + - + -n127 - -alpine +n132 + +Alpine: python3 - - -n127->n126 - - + + +n132->n51 + + - - -n122 - -amazon-ena + + +n132->n39 + + - + + +n70 + +Alpine: rpcsvc-proto + + + +n70->n84 + + + + + +n70->n30 + + + + -n4 - -amd-ucode +n68 + +Alpine: sed - - -n86 - -amdgpu + + +n68->n84 + + - - -n2 - -base + + +n68->n30 + + - - -n2->n122 - - + + +n126 + +Alpine: xen-dev - - -n2->n4 - - + + +n126->n123 + + - - -n2->n86 - - + + +n124 + +alpine - - -n133 - -binfmt-misc + + +n124->n123 + + - - -n2->n133 - - + + +n54 + +amazon-ena - + -n40 - -bnx2-bnx2x +n135 + +amd-ucode - - -n2->n40 - - - - + -n70 - -btrfs +n46 + +amdgpu - - -n2->n70 - - + + +n2 + +base - + + +n2->n54 + + + + + +n2->n135 + + + + + +n2->n46 + + + + -n65 - -chelsio-drivers +n58 + +binfmt-misc - - -n2->n65 - - + + +n2->n58 + + - + -n98 - -chelsio-firmware +n1 + +bnx2-bnx2x - - -n2->n98 - - + + +n2->n1 + + - + -n102 - -cloudflared - - - -n2->n102 - - +n142 + +btrfs - - -n104 - -crun - - - -n2->n104 - - + + +n2->n142 + + - + -n110 - -ctr +n117 + +chelsio-drivers - - -n2->n110 - - + + +n2->n117 + + - + -n89 - -drbd +n62 + +chelsio-firmware - - -n2->n89 - - + + +n2->n62 + + - + -n16 - -dvb-cx23885 +n137 + +cloudflared - - -n2->n16 - - + + +n2->n137 + + - + -n15 - -dvb-m88ds3103 +n133 + +crun - - -n2->n15 - - + + +n2->n133 + + - + -n48 - -ecr-credential-provider - - - -n2->n48 - - - - - -n71 - -fuse3 - - - -n2->n71 - - - - - -n73 - -gasket-driver +n86 + +ctr - - -n2->n73 - - + + +n2->n86 + + - + n52 - -glib + +drbd - + n2->n52 - - + + - - -n80 - -gvisor + + +n110 + +dvb-cx23885 - - -n2->n80 - - + + +n2->n110 + + - - -n49 - -gvisor-debug + + +n47 + +dvb-m88ds3103 - - -n2->n49 - - + + +n2->n47 + + - - -n82 - -hailort + + +n43 + +ecr-credential-provider - - -n2->n82 - - + + +n2->n43 + + - - -n66 - -hello-world-service + + +n100 + +fuse3 - - -n2->n66 - - + + +n2->n100 + + - + + +n73 + +gasket-driver + + + +n2->n73 + + + + + +n4 + +glib + + + +n2->n4 + + + + -n108 - -i915 +n116 + +gvisor - - -n2->n108 - - + + +n2->n116 + + - + -n91 - -intel-ice-firmware +n21 + +gvisor-debug - - -n2->n91 - - + + +n2->n21 + + - + -n17 - -intel-ucode +n16 + +hailort - - -n2->n17 - - + + +n2->n16 + + - + -n11 - -iscsi-tools +n134 + +hello-world-service - - -n2->n11 - - + + +n2->n134 + + - + -n81 - -kata-containers +n96 + +i915 - - -n2->n81 - - + + +n2->n96 + + - - -n60 - -libevent + + +n76 + +intel-ice-firmware - - -n2->n60 - - + + +n2->n76 + + - - -n112 - -libnvme + + +n111 + +intel-ucode - - -n2->n112 - - + + +n2->n111 + + - - -n63 - -libtirpc-zfs + + +n9 + +iscsi-tools - - -n2->n63 - - + + +n2->n9 + + - - -n53 - -lldpd + + +n93 + +kata-containers - - -n2->n53 - - + + +n2->n93 + + - - -n58 - -mdadm + + +n72 + +libevent - - -n2->n58 - - + + +n2->n72 + + - - -n124 - -mei + + +n42 + +libnvme - - -n2->n124 - - + + +n2->n42 + + - + -n87 - -metal-agent +n14 + +libtirpc-zfs - - -n2->n87 - - + + +n2->n14 + + - + -n116 - -nebula +n112 + +lldpd - - -n2->n116 - - + + +n2->n112 + + - + -n134 - -newt +n11 + +mdadm - - -n2->n134 - - + + +n2->n11 + + - + -n121 - -nfsd +n56 + +mei - - -n2->n121 - - + + +n2->n56 + + - + -n107 - -nfsrahead +n80 + +metal-agent - - -n2->n107 - - + + +n2->n80 + + - + -n118 - -nonfree-kmod-nvidia-lts +n113 + +nebula - + -n2->n118 - - +n2->n113 + + - + -n92 - -nonfree-kmod-nvidia-production +n102 + +netbird - - -n2->n92 - - + + +n2->n102 + + - + -n10 - -nut-client +n6 + +newt - - -n2->n10 - - + + +n2->n6 + + - - -n55 - -nvidia-container-runtime-wrapper + + +n115 + +nfsd - - -n2->n55 - - + + +n2->n115 + + - - -n69 - -nvidia-container-toolkit-lts + + +n119 + +nfsrahead - - -n2->n69 - - + + +n2->n119 + + - - -n54 - -nvidia-container-toolkit-production + + +n97 + +nonfree-kmod-nvidia-lts - - -n2->n54 - - + + +n2->n97 + + - - -n94 - -nvidia-fabricmanager-lts + + +n63 + +nonfree-kmod-nvidia-production - - -n2->n94 - - + + +n2->n63 + + - - -n77 - -nvidia-fabricmanager-production + + +n59 + +nut-client - - -n2->n77 - - + + +n2->n59 + + - + -n78 - -nvidia-open-gpu-kernel-modules-lts +n35 + +nvidia-container-runtime-wrapper - - -n2->n78 - - + + +n2->n35 + + - + -n37 - -nvidia-open-gpu-kernel-modules-production +n99 + +nvidia-container-toolkit-lts - - -n2->n37 - - + + +n2->n99 + + - + -n46 - -nvidia-persistenced-wrapper +n29 + +nvidia-container-toolkit-production - - -n2->n46 - - + + +n2->n29 + + - - -n114 - -nvme-cli + + +n37 + +nvidia-fabricmanager-lts - - -n2->n114 - - + + +n2->n37 + + - - -n12 - -open-iscsi + + +n77 + +nvidia-fabricmanager-production - - -n2->n12 - - + + +n2->n77 + + - + -n115 - -panfrost +n7 + +nvidia-open-gpu-kernel-modules-lts - - -n2->n115 - - + + +n2->n7 + + - + -n19 - -pcre2 +n91 + +nvidia-open-gpu-kernel-modules-production - - -n2->n19 - - + + +n2->n91 + + - + -n105 - -qemu-guest-agent - - - -n2->n105 - - - - - -n109 - -qlogic-firmware - - - -n2->n109 - - - - - -n56 - -realtek-firmware +n36 + +nvidia-persistenced-wrapper - - -n2->n56 - - + + +n2->n36 + + - + -n18 - -revpi-firmware +n40 + +nvme-cli - - -n2->n18 - - + + +n2->n40 + + - + -n39 - -spin +n10 + +open-iscsi - - -n2->n39 - - + + +n2->n10 + + - + -n61 - -sqlite +n143 + +panfrost - - -n2->n61 - - + + +n2->n143 + + - + -n117 - -stargz-snapshotter +n5 + +pcre2 - - -n2->n117 - - + + +n2->n5 + + - + -n8 - -tailscale +n136 + +qemu-guest-agent - - -n2->n8 - - + + +n2->n136 + + - + -n84 - -tenstorrent +n57 + +qlogic-firmware - - -n2->n84 - - + + +n2->n57 + + - + -n75 - -thunderbolt +n79 + +realtek-firmware - - -n2->n75 - - + + +n2->n79 + + - + -n125 - -uinput +n122 + +revpi-firmware - - -n2->n125 - - + + +n2->n122 + + - + -n76 - -usb-modem-drivers +n106 + +spin - - -n2->n76 - - + + +n2->n106 + + - + -n97 - -util-linux-tools +n60 + +sqlite - - -n2->n97 - - + + +n2->n60 + + - + -n1 - -v4l-uvc-drivers +n120 + +stargz-snapshotter - - -n2->n1 - - + + +n2->n120 + + - + -n14 - -vc4 +n114 + +tailscale - - -n2->n14 - - + + +n2->n114 + + - + -n67 - -vmtoolsd-guest-agent +n44 + +tenstorrent - - -n2->n67 - - + + +n2->n44 + + - + -n13 - -wasmedge +n18 + +thunderbolt - - -n2->n13 - - + + +n2->n18 + + - + -n50 - -xdma-driver +n107 + +uinput - - -n2->n50 - - + + +n2->n107 + + - - -n2->n126 - - + + +n75 + +usb-audio-drivers - + + +n2->n75 + + + + -n72 - -youki +n121 + +usb-modem-drivers - - -n2->n72 - - + + +n2->n121 + + - + -n106 - -zerotier +n78 + +util-linux-tools - - -n2->n106 - - + + +n2->n78 + + - + -n41 - -zerotier-wrapper +n20 + +v4l-uvc-drivers - - -n2->n41 - - + + +n2->n20 + + - + -n95 - -zfs +n101 + +vc4 - - -n2->n95 - - + + +n2->n101 + + - + -n9 - -zfs-service +n108 + +vmtoolsd-guest-agent - - -n2->n9 - - + + +n2->n108 + + - + -n62 - -zfs-tools +n88 + +wasmedge - - -n2->n62 - - + + +n2->n88 + + - - -n64 - -zlib-zfs + + +n94 + +xdma-driver - - -n2->n64 - - + + +n2->n94 + + - - -n21 - -cgr.dev/chainguard/wolfi-base + + +n118 + +xe - - -n21->n25 - - + + +n2->n118 + + - - -n21->n47 - - + + +n2->n123 + + - - -n21->n24 - - + + +n61 + +youki - - -n21->n23 - - + + +n2->n61 + + - - -n21->n27 - - + + +n48 + +zerotier - - -n21->n43 - - + + +n2->n48 + + - - -n21->n20 - - + + +n49 + +zerotier-wrapper - - -n21->n45 - - + + +n2->n49 + + - - -n21->n44 - - + + +n103 + +zfs - - -n21->n22 - - + + +n2->n103 + + - - -n21->n26 - - + + +n105 + +zfs-service - - -n16->n15 - - + + +n2->n105 + + - - -n25->n43 - - + + +n13 + +zfs-tools - - -n25->n20 - - + + +n2->n13 + + - - -n25->n69 - - + + +n15 + +zlib-zfs - - -n25->n54 - - + + +n2->n15 + + - - -n57 - -extensions + + +n23 + +cgr.dev/chainguard/wolfi-base - - -n111 - -ghcr.io/siderolabs/containerd:v1.12.0-alpha.0-17-gc4faa38 + + +n23->n22 + + - - -n111->n110 - - + + +n23->n51 + + - - -n90 - -ghcr.io/siderolabs/drbd-pkg:v1.12.0-alpha.0-17-gc4faa38 + + +n23->n31 + + - + -n90->n89 - - +n23->n32 + + - - -n123 - -ghcr.io/siderolabs/ena-pkg:v1.12.0-alpha.0-17-gc4faa38 - - - -n123->n122 - - + + +n23->n33 + + - - -n7 - -ghcr.io/siderolabs/extensions-validator:fe85801 + + +n23->n84 + + - - -n7->n2 - - + + +n23->n30 + + - - -n74 - -ghcr.io/siderolabs/gasket-driver-pkg:v1.12.0-alpha.0-17-gc4faa38 + + +n23->n34 + + - - -n74->n73 - - + + +n38 + +nvidia-fabricmanager-gcc-runtime - - -n83 - -ghcr.io/siderolabs/hailort-pkg:v1.12.0-alpha.0-17-gc4faa38 + + +n23->n38 + + - - -n83->n82 - - + + +n23->n39 + + - - -n3 - -ghcr.io/siderolabs/kernel:v1.12.0-alpha.0-17-gc4faa38 + + +n23->n85 + + - - -n3->n86 - - + + +n23->n50 + + - - -n3->n133 - - + + +n23->n24 + + - + -n3->n70 - - +n22->n84 + + - + -n3->n65 - - +n22->n30 + + - - -n3->n16 - - + + +n22->n99 + + - - -n3->n15 - - + + +n22->n29 + + - - -n3->n108 - - + + +n89 + +extensions - - -n3->n124 - - + + +n87 + +ghcr.io/siderolabs/containerd:v1.12.0-alpha.0-41-g661e578 - - -n3->n121 - - + + +n87->n86 + + - - -n3->n75 - - + + +n53 + +ghcr.io/siderolabs/drbd-pkg:v1.12.0-alpha.0-41-g661e578 - - -n3->n125 - - + + +n53->n52 + + - - -n3->n76 - - + + +n55 + +ghcr.io/siderolabs/ena-pkg:v1.12.0-alpha.0-41-g661e578 - - -n3->n1 - - + + +n55->n54 + + - + -n113 - -ghcr.io/siderolabs/libjson-c:v1.12.0-alpha.0-17-gc4faa38 - - - -n113->n112 - - +n83 + +ghcr.io/siderolabs/extensions-validator:fe85801 - - -n113->n114 - - + + +n83->n2 + + - + -n5 - -ghcr.io/siderolabs/linux-firmware:v1.12.0-alpha.0-17-gc4faa38 +n74 + +ghcr.io/siderolabs/gasket-driver-pkg:v1.12.0-alpha.0-41-g661e578 - - -n5->n4 - - + + +n74->n73 + + - - -n5->n86 - - + + +n17 + +ghcr.io/siderolabs/hailort-pkg:v1.12.0-alpha.0-41-g661e578 - - -n5->n40 - - + + +n17->n16 + + - - -n5->n98 - - + + +n19 + +ghcr.io/siderolabs/kernel:v1.12.0-alpha.0-41-g661e578 - - -n5->n108 - - + + +n19->n46 + + - - -n5->n91 - - + + +n19->n58 + + - - -n5->n109 - - + + +n19->n142 + + - - -n5->n56 - - + + +n19->n117 + + - - -n119 - -ghcr.io/siderolabs/nonfree-kmod-nvidia-lts-pkg:v1.12.0-alpha.0-17-gc4faa38 + + +n19->n110 + + - - -n119->n118 - - + + +n19->n96 + + - - -n93 - -ghcr.io/siderolabs/nonfree-kmod-nvidia-production-pkg:v1.12.0-alpha.0-17-gc4faa38 + + +n19->n56 + + - - -n93->n92 - - + + +n19->n115 + + - - -n79 - -ghcr.io/siderolabs/nvidia-open-gpu-kernel-modules-lts-pkg:v1.12.0-alpha.0-17-gc4faa38 + + +n19->n143 + + - - -n79->n78 - - + + +n19->n18 + + - - -n38 - -ghcr.io/siderolabs/nvidia-open-gpu-kernel-modules-production-pkg:v1.12.0-alpha.0-17-gc4faa38 + + +n19->n107 + + - - -n38->n37 - - + + +n19->n75 + + - - -n59 - -ghcr.io/siderolabs/systemd-udevd:v1.12.0-alpha.0-17-gc4faa38 + + +n19->n121 + + - - -n59->n58 - - + + +n19->n20 + + - - -n88 - -ghcr.io/siderolabs/talos-metal-agent:v0.1.3 + + +n19->n101 + + - - -n88->n87 - - + + +n19->n118 + + - - -n68 - -ghcr.io/siderolabs/talos-vmtoolsd:v1.3.0 + + +n41 + +ghcr.io/siderolabs/libjson-c:v1.12.0-alpha.0-41-g661e578 - + -n68->n67 - - +n41->n42 + + - - -n85 - -ghcr.io/siderolabs/tenstorrent-pkg:v1.12.0-alpha.0-17-gc4faa38 + + +n41->n40 + + - - -n85->n84 - - + + +n3 + +ghcr.io/siderolabs/linux-firmware:v1.12.0-alpha.0-41-g661e578 - - -n6 - -ghcr.io/siderolabs/tools:v1.12.0-alpha.0-5-g7c659e9 + + +n3->n135 + + - + -n6->n2 - - - - - -n51 - -ghcr.io/siderolabs/xdma-driver-pkg:v1.12.0-alpha.0-17-gc4faa38 - - - -n51->n50 - - +n3->n46 + + - - -n96 - -ghcr.io/siderolabs/zfs-pkg:v1.12.0-alpha.0-17-gc4faa38 + + +n3->n1 + + - + -n96->n95 - - +n3->n62 + + - + + +n3->n96 + + + + -n52->n105 - - +n3->n76 + + - - -n47->n44 - - + + +n3->n143 + + - - -n47->n22 - - + + +n3->n57 + + - - -n24->n43 - - + + +n3->n79 + + - + -n24->n20 - - +n3->n118 + + + + + +n98 + +ghcr.io/siderolabs/nonfree-kmod-nvidia-lts-pkg:v1.12.0-alpha.0-41-g661e578 - + -n24->n69 - - +n98->n97 + + - - -n24->n54 - - + + +n64 + +ghcr.io/siderolabs/nonfree-kmod-nvidia-production-pkg:v1.12.0-alpha.0-41-g661e578 - + -n60->n107 - - +n64->n63 + + + + + +n8 + +ghcr.io/siderolabs/nvidia-open-gpu-kernel-modules-lts-pkg:v1.12.0-alpha.0-41-g661e578 - + -n112->n114 - - +n8->n7 + + + + + +n92 + +ghcr.io/siderolabs/nvidia-open-gpu-kernel-modules-production-pkg:v1.12.0-alpha.0-41-g661e578 - + -n112->n114 - - +n92->n91 + + - - -n23->n43 - - + + +n12 + +ghcr.io/siderolabs/systemd-udevd:v1.12.0-alpha.0-41-g661e578 - + -n23->n20 - - +n12->n11 + + - - -n23->n69 - - + + +n81 + +ghcr.io/siderolabs/talos-metal-agent:v0.1.3 + + + +n81->n80 + + + + + +n109 + +ghcr.io/siderolabs/talos-vmtoolsd:v1.4.0 - + -n23->n54 - - +n109->n108 + + - - -n27->n43 - - + + +n45 + +ghcr.io/siderolabs/tenstorrent-pkg:v1.12.0-alpha.0-41-g661e578 + + + +n45->n44 + + - + + +n82 + +ghcr.io/siderolabs/tools:v1.12.0-alpha.0-15-ge62d613 + + -n27->n20 - - +n82->n2 + + - - -n27->n69 - - + + +n95 + +ghcr.io/siderolabs/xdma-driver-pkg:v1.12.0-alpha.0-41-g661e578 - + + +n95->n94 + + + + + +n104 + +ghcr.io/siderolabs/zfs-pkg:v1.12.0-alpha.0-41-g661e578 + + -n27->n54 - - +n104->n103 + + - - -n63->n107 - - + + +n4->n136 + + - + -n63->n95 - - +n51->n85 + + - + -n63->n62 - - - - - -n43->n69 - - +n51->n50 + + - + -n20->n54 - - +n31->n84 + + - - -n45->n69 - - + + +n31->n30 + + - + -n45->n54 - - +n31->n99 + + - - -n55->n69 - - + + +n31->n29 + + - - -n55->n54 - - + + +n72->n119 + + - - -n46->n69 - - + + +n42->n40 + + - - -n46->n54 - - + + +n42->n40 + + - + -n44->n43 - - +n32->n84 + + - - -n22->n20 - - + + +n32->n30 + + - - -n12->n11 - - + + +n32->n99 + + - - -n19->n52 - - + + +n32->n29 + + - + -n19->n105 - - +n33->n84 + + - + + +n33->n30 + + + + -n61->n107 - - +n33->n99 + + - - -n1->n16 - - + + +n33->n29 + + - + + +n14->n119 + + + + -n41->n106 - - +n14->n103 + + - - -n9->n95 - - + + +n14->n13 + + - + -n62->n95 - - +n84->n99 + + - - -n26->n25 - - + + +n30->n29 + + - + -n26->n43 - - +n34->n99 + + - - -n26->n20 - - + + +n34->n29 + + - + -n26->n69 - - +n35->n99 + + - + -n26->n54 - - +n35->n29 + + + + + +n38->n37 + + - + -n64->n95 - - +n38->n77 + + + + + +n39->n37 + + + + + +n39->n77 + + + + + +n36->n99 + + + + + +n36->n29 + + - - -n64->n62 - - + + +n85->n84 + + + + + +n50->n30 + + + + + +n10->n9 + + + + + +n5->n4 + + + + + +n5->n136 + + + + + +n60->n119 + + + + + +n20->n110 + + + + + +n49->n48 + + + + + +n105->n103 + + + + + +n13->n103 + + + + + +n24->n22 + + + + + +n24->n84 + + + + + +n24->n30 + + + + + +n24->n99 + + + + + +n24->n29 + + + + + +n15->n103 + + + + + +n15->n13 + + diff --git a/go.work b/go.work index 9e4c3f7f..ad1c2cf1 100644 --- a/go.work +++ b/go.work @@ -5,4 +5,5 @@ use ( ./examples/hello-world-service/src ./nvidia-gpu/nvidia-container-toolkit/nvidia-container-runtime-wrapper ./nvidia-gpu/nvidia-container-toolkit/nvidia-persistenced-wrapper + ./nvidia-gpu/nvidia-fabricmanager/nvidia-fabricmanager-wrapper ) diff --git a/nvidia-gpu/nvidia-fabricmanager/lts/nvidia-fabricmanager.yaml b/nvidia-gpu/nvidia-fabricmanager/lts/nvidia-fabricmanager.yaml index f252c5af..36435196 100644 --- a/nvidia-gpu/nvidia-fabricmanager/lts/nvidia-fabricmanager.yaml +++ b/nvidia-gpu/nvidia-fabricmanager/lts/nvidia-fabricmanager.yaml @@ -1,10 +1,7 @@ # https://docs.nvidia.com/datacenter/tesla/pdf/fabric-manager-user-guide.pdf name: nvidia-fabricmanager container: - entrypoint: /usr/local/bin/nv-fabricmanager - args: - - --config - - /usr/local/share/nvidia/nvswitch/fabricmanager.cfg + entrypoint: /usr/bin/nvidia-fabricmanager-wrapper mounts: # device files - source: /dev @@ -28,44 +25,40 @@ container: options: - bind - ro - # nvidia libraries - - source: /usr/local/lib - destination: /usr/local/lib - type: bind - options: - - bind - - ro # service state file + # * nvlsm: + # - pid file that can't be disabled + # - unix socket /var/run/nvidia-fabricmanager/fm_sm_ipc.socket + # can't be changed, path is hardcoded into fabricmanager + # * fabricmanager + # - state file + # - database files - source: /var/run/nvidia-fabricmanager - destination: /var/run/nvidia-fabricmanager + destination: /var/run type: bind options: - rshared - rbind - rw - # log files - - source: /var/log - destination: /var/log + # service cache file + # * nvlsm: database files + - source: /var/cache/nvidia-fabricmanager + destination: /var/cache type: bind options: - rshared - rbind - rw - # fabric topology files - - source: /usr/local/share/nvidia/nvswitch - destination: /usr/local/share/nvidia/nvswitch + # service log files + # * nvlsm: + # - mandatory dump files hardcoded to /var/log/, so /var/log must be writable + - source: /var/log/nvidia-fabricmanager + destination: /var/log type: bind options: - rshared - rbind - - ro - # binaries - - source: /usr/local/bin - destination: /usr/local/bin - type: bind - options: - - bind - - ro + - rw depends: - service: cri # we need to depend on udevd so that the nvidia device files are created diff --git a/nvidia-gpu/nvidia-fabricmanager/lts/pkg.yaml b/nvidia-gpu/nvidia-fabricmanager/lts/pkg.yaml index 3fb61c52..65a1721b 100644 --- a/nvidia-gpu/nvidia-fabricmanager/lts/pkg.yaml +++ b/nvidia-gpu/nvidia-fabricmanager/lts/pkg.yaml @@ -2,7 +2,9 @@ name: nvidia-fabricmanager-lts variant: scratch shell: /bin/bash dependencies: - - stage: base + - stage: base + - stage: nvidia-fabricmanager-gcc-runtime + - stage: nvidia-fabricmanager-wrapper steps: - sources: # {{ if eq .ARCH "aarch64" }} This in fact is YAML comment, but Go templating instruction is evaluated by bldr @@ -10,40 +12,69 @@ steps: destination: fabricmanager.tar.xz sha256: {{ .NVIDIA_FABRIC_MANAGER_LTS_ARM64_SHA256 }} sha512: {{ .NVIDIA_FABRIC_MANAGER_LTS_ARM64_SHA512 }} + - url: https://developer.download.nvidia.com/compute/cuda/redist/nvlsm/linux-sbsa/nvlsm-linux-sbsa-{{ .NVIDIA_NVLSM_LTS_VERSION }}-archive.tar.xz + destination: nvlsm.tar.xz + sha256: {{ .NVIDIA_NVLSM_LTS_ARM64_SHA256 }} + sha512: {{ .NVIDIA_NVLSM_LTS_ARM64_SHA512 }} # {{ else }} This in fact is YAML comment, but Go templating instruction is evaluated by bldr - url: https://developer.download.nvidia.com/compute/nvidia-driver/redist/fabricmanager/linux-x86_64/fabricmanager-linux-x86_64-{{ .NVIDIA_DRIVER_LTS_VERSION }}-archive.tar.xz destination: fabricmanager.tar.xz sha256: {{ .NVIDIA_FABRIC_MANAGER_LTS_AMD64_SHA256 }} sha512: {{ .NVIDIA_FABRIC_MANAGER_LTS_AMD64_SHA512 }} + - url: https://developer.download.nvidia.com/compute/cuda/redist/nvlsm/linux-x86_64/nvlsm-linux-x86_64-{{ .NVIDIA_NVLSM_LTS_VERSION }}-archive.tar.xz + destination: nvlsm.tar.xz + sha256: {{ .NVIDIA_NVLSM_LTS_AMD64_SHA256 }} + sha512: {{ .NVIDIA_NVLSM_LTS_AMD64_SHA512 }} # {{ end }} This in fact is YAML comment, but Go templating instruction is evaluated by bldr prepare: - | - tar -xf fabricmanager.tar.xz --strip-components=1 + mkdir fm sm + tar -xf fabricmanager.tar.xz --strip-components=1 -C fm + tar -xf nvlsm.tar.xz --strip-components=1 -C sm install: - | - mkdir -p /rootfs/usr/local/bin \ - /rootfs/usr/local/lib \ - /rootfs/usr/local/share/nvidia/nvswitch \ - /rootfs/usr/local/lib/containers/nvidia-fabricmanager \ - /rootfs/usr/local/etc/containers + mkdir -p /rootfs/usr/local/lib/containers/nvidia-fabricmanager/usr/bin \ + /rootfs/usr/local/lib/containers/nvidia-fabricmanager/usr/lib \ + /rootfs/usr/local/lib/containers/nvidia-fabricmanager/usr/share/nvidia/nvswitch \ + /rootfs/usr/local/lib/containers/nvidia-fabricmanager/opt/nvidia/nvlsm/sbin \ + /rootfs/usr/local/lib/containers/nvidia-fabricmanager/opt/nvidia/nvlsm/lib \ + /rootfs/usr/local/lib/containers/nvidia-fabricmanager/usr/share/nvidia/nvlsm + # nvlsm + - | + cp sm/sbin/nvlsm /rootfs/usr/local/lib/containers/nvidia-fabricmanager/opt/nvidia/nvlsm/sbin/ - cp lib/libnvfm.so.1 /rootfs/usr/local/lib/libnvfm.so.1 - ln -s libnvfm.so.1 /rootfs/usr/local/lib/libnvfm.so + cp sm/lib/libgrpc_mgr.so /rootfs/usr/local/lib/containers/nvidia-fabricmanager/opt/nvidia/nvlsm/lib/ - cp bin/nv-fabricmanager /rootfs/usr/local/bin/ - cp bin/nvswitch-audit /rootfs/usr/local/bin/ + cp sm/share/nvidia/nvlsm/device_configuration.conf \ + sm/share/nvidia/nvlsm/grpc_mgr.conf \ + sm/share/nvidia/nvlsm/nvlsm.conf \ + /rootfs/usr/local/lib/containers/nvidia-fabricmanager/usr/share/nvidia/nvlsm/ + # fabricmanager + - | + cp fm/bin/nv-fabricmanager \ + fm/bin/nvswitch-audit \ + /rootfs/usr/local/lib/containers/nvidia-fabricmanager/usr/bin/ - cp share/nvidia/nvswitch/dgx2_hgx2_topology /rootfs/usr/local/share/nvidia/nvswitch/ - cp share/nvidia/nvswitch/dgxa100_hgxa100_topology /rootfs/usr/local/share/nvidia/nvswitch/ + cp fm/lib/libnvfm.so.1 /rootfs/usr/local/lib/containers/nvidia-fabricmanager/usr/lib/ + ln -s libnvfm.so.1 /rootfs/usr/local/lib/containers/nvidia-fabricmanager/usr/lib/libnvfm.so - cp etc/fabricmanager.cfg /rootfs/usr/local/share/nvidia/nvswitch/ + cp fm/share/nvidia/nvswitch/* \ + fm/etc/fabricmanager.cfg \ + /rootfs/usr/local/lib/containers/nvidia-fabricmanager/usr/share/nvidia/nvswitch/ - cp /pkg/nvidia-fabricmanager.yaml /rootfs/usr/local/etc/containers/nvidia-fabricmanager.yaml + sed -i 's/DAEMONIZE=.*/DAEMONIZE=0/g' /rootfs/usr/local/lib/containers/nvidia-fabricmanager/usr/share/nvidia/nvswitch/fabricmanager.cfg + sed -i 's/LOG_FILE_NAME=.*/LOG_FILE_NAME=/g' /rootfs/usr/local/lib/containers/nvidia-fabricmanager/usr/share/nvidia/nvswitch/fabricmanager.cfg + sed -i 's#STATE_FILE_NAME=.*#STATE_FILE_NAME=/var/run/fabricmanager.state#g' /rootfs/usr/local/lib/containers/nvidia-fabricmanager/usr/share/nvidia/nvswitch/fabricmanager.cfg - sed -i 's/DAEMONIZE=.*/DAEMONIZE=0/g' /rootfs/usr/local/share/nvidia/nvswitch/fabricmanager.cfg - sed -i 's/STATE_FILE_NAME=.*/STATE_FILE_NAME=\/var\/run\/nvidia-fabricmanager\/fabricmanager.state/g' /rootfs/usr/local/share/nvidia/nvswitch/fabricmanager.cfg - sed -i 's/TOPOLOGY_FILE_PATH=.*/TOPOLOGY_FILE_PATH=\/usr\/local\/share\/nvidia\/nvswitch/g' /rootfs/usr/local/share/nvidia/nvswitch/fabricmanager.cfg - sed -i 's/DATABASE_PATH=.*/DATABASE_PATH=\/usr\/local\/share\/nvidia\/nvswitch/g' /rootfs/usr/local/share/nvidia/nvswitch/fabricmanager.cfg + if grep -q '^DATABASE_PATH=' /rootfs/usr/local/lib/containers/nvidia-fabricmanager/usr/share/nvidia/nvswitch/fabricmanager.cfg + then + sed -i 's#DATABASE_PATH=.*#DATABASE_PATH=/var/run#g' /rootfs/usr/local/lib/containers/nvidia-fabricmanager/usr/share/nvidia/nvswitch/fabricmanager.cfg + else + echo -e '\nDATABASE_PATH=/var/run\n' >>/rootfs/usr/local/lib/containers/nvidia-fabricmanager/usr/share/nvidia/nvswitch/fabricmanager.cfg + fi + - | + mkdir -p /rootfs/usr/local/etc/containers + cp /pkg/nvidia-fabricmanager.yaml /rootfs/usr/local/etc/containers/nvidia-fabricmanager.yaml test: - | mkdir -p /extensions-validator-rootfs diff --git a/nvidia-gpu/nvidia-fabricmanager/nvidia-fabricmanager-gcc-runtime/pkg.yaml b/nvidia-gpu/nvidia-fabricmanager/nvidia-fabricmanager-gcc-runtime/pkg.yaml new file mode 100644 index 00000000..f6f6a9d3 --- /dev/null +++ b/nvidia-gpu/nvidia-fabricmanager/nvidia-fabricmanager-gcc-runtime/pkg.yaml @@ -0,0 +1,12 @@ +name: nvidia-fabricmanager-gcc-runtime +variant: scratch +dependencies: + - image: cgr.dev/chainguard/wolfi-base@{{ .WOLFI_BASE_REF }} +steps: + - install: + - | + mkdir -p /rootfs/usr/local/lib/containers/nvidia-fabricmanager/opt/nvidia/nvlsm/lib + cp /usr/lib/libgcc_s.so.1 /rootfs/usr/local/lib/containers/nvidia-fabricmanager/opt/nvidia/nvlsm/lib/ +finalize: + - from: /rootfs + to: /rootfs diff --git a/nvidia-gpu/nvidia-fabricmanager/nvidia-fabricmanager-gcc-runtime/vars.yaml b/nvidia-gpu/nvidia-fabricmanager/nvidia-fabricmanager-gcc-runtime/vars.yaml new file mode 100644 index 00000000..f2bd455a --- /dev/null +++ b/nvidia-gpu/nvidia-fabricmanager/nvidia-fabricmanager-gcc-runtime/vars.yaml @@ -0,0 +1 @@ +INTERNAL_PACKAGE: true diff --git a/nvidia-gpu/nvidia-fabricmanager/nvidia-fabricmanager-wrapper/go.mod b/nvidia-gpu/nvidia-fabricmanager/nvidia-fabricmanager-wrapper/go.mod new file mode 100644 index 00000000..dcb534ae --- /dev/null +++ b/nvidia-gpu/nvidia-fabricmanager/nvidia-fabricmanager-wrapper/go.mod @@ -0,0 +1,5 @@ +module nvidia-fabricmanager-wrapper + +go 1.23.0 + +require github.com/goaux/decowriter v1.0.0 diff --git a/nvidia-gpu/nvidia-fabricmanager/nvidia-fabricmanager-wrapper/go.sum b/nvidia-gpu/nvidia-fabricmanager/nvidia-fabricmanager-wrapper/go.sum new file mode 100644 index 00000000..2e1011c1 --- /dev/null +++ b/nvidia-gpu/nvidia-fabricmanager/nvidia-fabricmanager-wrapper/go.sum @@ -0,0 +1,2 @@ +github.com/goaux/decowriter v1.0.0 h1:f1mfBWGFIo3Upev3gswfGLQzQvC4SBVYi2ZAkNZsIaU= +github.com/goaux/decowriter v1.0.0/go.mod h1:8GKUmiBlNCYxVHU2vlZoQHwLvYh7Iw1c7/tRekJbX7o= diff --git a/nvidia-gpu/nvidia-fabricmanager/nvidia-fabricmanager-wrapper/main.go b/nvidia-gpu/nvidia-fabricmanager/nvidia-fabricmanager-wrapper/main.go new file mode 100644 index 00000000..39992ed9 --- /dev/null +++ b/nvidia-gpu/nvidia-fabricmanager/nvidia-fabricmanager-wrapper/main.go @@ -0,0 +1,156 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at http://mozilla.org/MPL/2.0/. + +package main + +import ( + "bufio" + "context" + "fmt" + "log" + "os" + "os/exec" + "os/signal" + "path/filepath" + "strings" + "sync" + "syscall" + "time" + + "github.com/goaux/decowriter" +) + +const ( + // FabricManager + fmCmdFile = "/usr/bin/nv-fabricmanager" + fmConfigFile = "/usr/share/nvidia/nvswitch/fabricmanager.cfg" + fmStopTimeout = 5 * time.Second + + // NVLSM + smCmdFile = "/opt/nvidia/nvlsm/sbin/nvlsm" + smConfigFile = "/usr/share/nvidia/nvlsm/nvlsm.conf" + smPidFile = "/var/run/nvlsm.pid" + smSocket = "/var/run/nvidia-fabricmanager/fm_sm_ipc.socket" + smStopTimeout = 5 * time.Second + smSocketWait = 15 * time.Second +) + +func runCommand(ctx context.Context, wg *sync.WaitGroup, doneCb func(), waitDelay time.Duration, path string, arg ...string) { + wg.Add(1) + + cmd := exec.CommandContext(ctx, path, arg...) + cmd.WaitDelay = waitDelay + cmd.Cancel = func() error { + return cmd.Process.Signal(os.Interrupt) + } + + // TODO line writer to log module + name := filepath.Base(path) + cmd.Stdout = decowriter.New(bufio.NewWriter(os.Stdout), []byte(name+": "), []byte{}) + cmd.Stderr = decowriter.New(bufio.NewWriter(os.Stderr), []byte(name+": "), []byte{}) + + go func() { + log.Printf("nvidia-fabricmanager-wrapper: running command: %s %s\n", path, strings.Join(arg, " ")) + + err := cmd.Run() + if err == nil { + log.Printf("nvidia-fabricmanager-wrapper: command %s [%d] completed successfully\n", path, cmd.Process.Pid) + } else if exitErr, ok := err.(*exec.ExitError); ok { + if exitErr.Exited() { + log.Printf("nvidia-fabricmanager-wrapper: command %s [%d] exited with code %d\n", path, exitErr.Pid(), + exitErr.ExitCode()) + } else { + log.Printf("nvidia-fabricmanager-wrapper: command %s [%d] was terminated\n", path, exitErr.Pid()) + } + } else { + log.Printf("nvidia-fabricmanager-wrapper: failed to run command %s: %v\n", path, err) + } + + wg.Done() + doneCb() + }() +} + +func waitForFile(ctx context.Context, filepath string, timeout time.Duration) error { + timer := time.NewTimer(timeout) + defer timer.Stop() + + for { + select { + case <-ctx.Done(): + return fmt.Errorf("parent context canceled: %w", ctx.Err()) + case <-timer.C: + return fmt.Errorf("timeout waiting for file") + default: + if _, err := os.Stat(filepath); err == nil { + return nil + } + time.Sleep(100 * time.Millisecond) + } + } +} + +func main() { + var cmdWg sync.WaitGroup + + signal.Ignore(syscall.SIGHUP) + + runCtx, gracefulShutdown := context.WithCancel(context.Background()) + + signalsChan := make(chan os.Signal, 1) + signal.Notify(signalsChan, os.Interrupt) + signal.Notify(signalsChan, syscall.SIGTERM) + + go func() { + received := <-signalsChan + signal.Stop(signalsChan) + log.Printf("nvidia-fabricmanager-wrapper: received signal '%s', initiating a graceful shutdown\n", received.String()) + gracefulShutdown() + }() + + nvswitchPorts := findNvswitchMgmtPorts() + for _, port := range nvswitchPorts { + log.Printf("nvidia-fabricmanager-wrapper: found NVSwitch LPF: device=%s guid=0x%x\n", port.IBDevice, port.PortGUID) + } + + fmSmMgmtPortGUID := "" + if len(nvswitchPorts) > 0 { + fmSmMgmtPortGUID = fmt.Sprintf("0x%x", nvswitchPorts[0].PortGUID) + log.Printf("nvidia-fabricmanager-wrapper: using NVSwitch management port GUID: %s\n", fmSmMgmtPortGUID) + } else { + log.Println("nvidia-fabricmanager-wrapper: No InfiniBand NVSwitch detected. On Blackwell HGX baseboards and newer", + "with NVLink 5.0+, please load kernel module 'ib_umad' for NVLSM to run along FabricManager. Otherwise it will", + "fail to start with error NV_WARN_NOTHING_TO_DO, and GPU workloads will report CUDA_ERROR_SYSTEM_NOT_READY.") + } + + if fmSmMgmtPortGUID != "" { + if err := os.Mkdir(filepath.Dir(smSocket), 0755); err != nil { + log.Printf("nvidia-fabricmanager-wrapper: error creating socket directory: %v\n", err) + } + + runCommand(runCtx, &cmdWg, gracefulShutdown, smStopTimeout, smCmdFile, "--config", smConfigFile, + "--guid", fmSmMgmtPortGUID, "--pid_file", smPidFile, "--log_file", "stdout") + + // vendor startup script waits for 5 seconds for NVLSM socket to be available before starting FM + // let's wait for the actual GRPC socket to be created by the plugin + log.Println("nvidia-fabricmanager-wrapper: waiting for socket creation at", smSocket) + err := waitForFile(runCtx, smSocket, smSocketWait) + if err != nil { + log.Printf("nvidia-fabricmanager-wrapper: error waiting for socket: %v\n", err) + } else { + log.Println("nvidia-fabricmanager-wrapper: socket found at", smSocket) + } + // for safety + time.Sleep(time.Second) + } + + fmCmdArgs := []string{"--config", fmConfigFile} + if fmSmMgmtPortGUID != "" { + fmCmdArgs = append(fmCmdArgs, "--fm-sm-mgmt-port-guid", fmSmMgmtPortGUID) + } + runCommand(runCtx, &cmdWg, gracefulShutdown, fmStopTimeout, fmCmdFile, fmCmdArgs...) + + log.Println("nvidia-fabricmanager-wrapper: initialization completed") + cmdWg.Wait() +} diff --git a/nvidia-gpu/nvidia-fabricmanager/nvidia-fabricmanager-wrapper/nvswitch.go b/nvidia-gpu/nvidia-fabricmanager/nvidia-fabricmanager-wrapper/nvswitch.go new file mode 100644 index 00000000..1351e748 --- /dev/null +++ b/nvidia-gpu/nvidia-fabricmanager/nvidia-fabricmanager-wrapper/nvswitch.go @@ -0,0 +1,106 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at http://mozilla.org/MPL/2.0/. + +package main + +// #cgo CFLAGS: -I./rdma-core/include/ +// #cgo LDFLAGS: -L./rdma-core/lib/statics/ -l:libibumad.a +// #include /* __be64 */ +// #include +import "C" +import ( + "bytes" + "encoding/binary" + "os" + "path" + "unsafe" +) + +type NVSwitchMgmtPort struct { + IBDevice string + PortGUID uint64 +} + +/* +Find InfiniBand devices with the capability to configure NVSwitches. +--- +From: https://docs.nvidia.com/datacenter/tesla/pdf/fabric-manager-user-guide.pdf + +The CX7 bridge device is integrated into the GPU baseboard, which includes two physical ports. Each port exposes one +physical function (FC PF) and one Limited physical function (LPF) to the host system, which totals four PFs. The PFs +are categorized into the following PFs: + - Limited PFs (LPF) are designated for specific tasks in the system. + They are used by the FM and the NVLSM to configure and set up NVSwitches, GPU, and NVLink routing information. + LPFs are also used by telemetry agents, such as NVIBDM and DCGM, to monitor and collect data. Resetting this + PF with FLR also resets the corresponding NVSwitch device. + +To differentiate between LPFs and FC PFs, the LPF VPD information includes a vendor-specific field called SMDL, with +a non-zero value defined as SW_MNG. For bare-metal, full pass-through, and shared NVSwitch deployments, the prelaunch +script in the FM service unit file will run and query the available CX7 devices for this VPD information. The file +populates the required FM and NVLSM configuration values so that these communication entities can access the relevant +devices. +*/ +func findLpfDevices() (devices []string) { + const ibPath = "/sys/class/infiniband" + + devDir, err := os.ReadDir(ibPath) + if err != nil { + return + } + + for _, device := range devDir { + vpd, err := os.ReadFile(path.Join(ibPath, device.Name(), "device/vpd")) + if err != nil { + continue + } + + if bytes.Contains(vpd, []byte("SMDL=SW_MNG")) { + devices = append(devices, device.Name()) + } + } + return +} + +func findNvswitchMgmtPorts() (ports []NVSwitchMgmtPort) { + lpfDevs := findLpfDevices() + if len(lpfDevs) == 0 { + return + } + + if C.umad_init() < 0 { + return + } + + for _, lpf := range lpfDevs { + const maxPorts = 16 + var portGUIDs [maxPorts]C.__be64 + + /* + $ man 3 umad_get_ca_portguids + + On success, umad_get_ca_portguids() returns a non-negative value equal to the number of port GUIDs actually + filled. Not all filled entries may be valid. Invalid entries will be 0. For example, on a CA node with only + one port, this function returns a value of 2. In this case, the value at index 0 will be invalid as it is + reserved for switches. On failure, a negative value is returned. + */ + numPort := C.umad_get_ca_portguids(C.CString(lpf), &portGUIDs[0], maxPorts) + + for i := range int(numPort) { + var guid uint64 + + // convert kernel __be64 to uint64 + buf := bytes.NewReader((*[8]byte)(unsafe.Pointer(&portGUIDs[i]))[:]) + if err := binary.Read(buf, binary.BigEndian, &guid); err != nil { + continue + } + + if guid != 0 { + ports = append(ports, NVSwitchMgmtPort{lpf, guid}) + } + } + } + + C.umad_done() + return +} diff --git a/nvidia-gpu/nvidia-fabricmanager/nvidia-fabricmanager-wrapper/pkg.yaml b/nvidia-gpu/nvidia-fabricmanager/nvidia-fabricmanager-wrapper/pkg.yaml new file mode 100644 index 00000000..7aae9ddc --- /dev/null +++ b/nvidia-gpu/nvidia-fabricmanager/nvidia-fabricmanager-wrapper/pkg.yaml @@ -0,0 +1,55 @@ +name: nvidia-fabricmanager-wrapper +variant: scratch +shell: /bin/bash +dependencies: + - image: cgr.dev/chainguard/wolfi-base@{{ .WOLFI_BASE_REF }} +install: + - bash + - build-base + - gcc-14 + - cmake + - ninja + - python3 + - libnl3-dev + - go-1.24 +steps: + - sources: + - url: https://github.com/linux-rdma/rdma-core/releases/download/v{{ .RDMA_CORE_VERSION }}/rdma-core-{{ .RDMA_CORE_VERSION }}.tar.gz + destination: rdma-core.tar.gz + sha256: {{ .RDMA_CORE_SHA256 }} + sha512: {{ .RDMA_CORE_SHA512 }} + - env: + CC: gcc-14 + CXX: g++-14 + GOPATH: /tmp/go + - cachePaths: + - /.cache/go-build + - /tmp/go/pkg + - network: default + prepare: + - | + mkdir rdma-core + tar -xzf rdma-core.tar.gz --strip-components=1 -C rdma-core + - | + cp -r /pkg/* . + - | + go mod download + - network: none + build: + - | + cd rdma-core + cmake -GNinja \ + -DENABLE_STATIC=1 \ + -DNO_MAN_PAGES=1 \ + -DNO_PYVERBS=1 \ + -DCMAKE_BUILD_TYPE=Release + ninja lib/statics/libibumad.a + - | + CGO_ENABLED=1 go build -o nvidia-fabricmanager-wrapper nvidia-fabricmanager-wrapper + install: + - | + mkdir -p /rootfs/usr/local/lib/containers/nvidia-fabricmanager/usr/bin + cp nvidia-fabricmanager-wrapper /rootfs/usr/local/lib/containers/nvidia-fabricmanager/usr/bin/ +finalize: + - from: /rootfs + to: /rootfs diff --git a/nvidia-gpu/nvidia-fabricmanager/nvidia-fabricmanager-wrapper/vars.yaml b/nvidia-gpu/nvidia-fabricmanager/nvidia-fabricmanager-wrapper/vars.yaml new file mode 100644 index 00000000..f2bd455a --- /dev/null +++ b/nvidia-gpu/nvidia-fabricmanager/nvidia-fabricmanager-wrapper/vars.yaml @@ -0,0 +1 @@ +INTERNAL_PACKAGE: true diff --git a/nvidia-gpu/nvidia-fabricmanager/production/nvidia-fabricmanager.yaml b/nvidia-gpu/nvidia-fabricmanager/production/nvidia-fabricmanager.yaml index f252c5af..36435196 100644 --- a/nvidia-gpu/nvidia-fabricmanager/production/nvidia-fabricmanager.yaml +++ b/nvidia-gpu/nvidia-fabricmanager/production/nvidia-fabricmanager.yaml @@ -1,10 +1,7 @@ # https://docs.nvidia.com/datacenter/tesla/pdf/fabric-manager-user-guide.pdf name: nvidia-fabricmanager container: - entrypoint: /usr/local/bin/nv-fabricmanager - args: - - --config - - /usr/local/share/nvidia/nvswitch/fabricmanager.cfg + entrypoint: /usr/bin/nvidia-fabricmanager-wrapper mounts: # device files - source: /dev @@ -28,44 +25,40 @@ container: options: - bind - ro - # nvidia libraries - - source: /usr/local/lib - destination: /usr/local/lib - type: bind - options: - - bind - - ro # service state file + # * nvlsm: + # - pid file that can't be disabled + # - unix socket /var/run/nvidia-fabricmanager/fm_sm_ipc.socket + # can't be changed, path is hardcoded into fabricmanager + # * fabricmanager + # - state file + # - database files - source: /var/run/nvidia-fabricmanager - destination: /var/run/nvidia-fabricmanager + destination: /var/run type: bind options: - rshared - rbind - rw - # log files - - source: /var/log - destination: /var/log + # service cache file + # * nvlsm: database files + - source: /var/cache/nvidia-fabricmanager + destination: /var/cache type: bind options: - rshared - rbind - rw - # fabric topology files - - source: /usr/local/share/nvidia/nvswitch - destination: /usr/local/share/nvidia/nvswitch + # service log files + # * nvlsm: + # - mandatory dump files hardcoded to /var/log/, so /var/log must be writable + - source: /var/log/nvidia-fabricmanager + destination: /var/log type: bind options: - rshared - rbind - - ro - # binaries - - source: /usr/local/bin - destination: /usr/local/bin - type: bind - options: - - bind - - ro + - rw depends: - service: cri # we need to depend on udevd so that the nvidia device files are created diff --git a/nvidia-gpu/nvidia-fabricmanager/production/pkg.yaml b/nvidia-gpu/nvidia-fabricmanager/production/pkg.yaml index d5988587..9d9e9682 100644 --- a/nvidia-gpu/nvidia-fabricmanager/production/pkg.yaml +++ b/nvidia-gpu/nvidia-fabricmanager/production/pkg.yaml @@ -2,7 +2,9 @@ name: nvidia-fabricmanager-production variant: scratch shell: /bin/bash dependencies: - - stage: base + - stage: base + - stage: nvidia-fabricmanager-gcc-runtime + - stage: nvidia-fabricmanager-wrapper steps: - sources: # {{ if eq .ARCH "aarch64" }} This in fact is YAML comment, but Go templating instruction is evaluated by bldr @@ -10,40 +12,69 @@ steps: destination: fabricmanager.tar.xz sha256: {{ .NVIDIA_FABRIC_MANAGER_PRODUCTION_ARM64_SHA256 }} sha512: {{ .NVIDIA_FABRIC_MANAGER_PRODUCTION_ARM64_SHA512 }} + - url: https://developer.download.nvidia.com/compute/cuda/redist/nvlsm/linux-sbsa/nvlsm-linux-sbsa-{{ .NVIDIA_NVLSM_PRODUCTION_VERSION }}-archive.tar.xz + destination: nvlsm.tar.xz + sha256: {{ .NVIDIA_NVLSM_PRODUCTION_ARM64_SHA256 }} + sha512: {{ .NVIDIA_NVLSM_PRODUCTION_ARM64_SHA512 }} # {{ else }} This in fact is YAML comment, but Go templating instruction is evaluated by bldr - url: https://developer.download.nvidia.com/compute/nvidia-driver/redist/fabricmanager/linux-x86_64/fabricmanager-linux-x86_64-{{ .NVIDIA_DRIVER_PRODUCTION_VERSION }}-archive.tar.xz destination: fabricmanager.tar.xz sha256: {{ .NVIDIA_FABRIC_MANAGER_PRODUCTION_AMD64_SHA256 }} sha512: {{ .NVIDIA_FABRIC_MANAGER_PRODUCTION_AMD64_SHA512 }} + - url: https://developer.download.nvidia.com/compute/cuda/redist/nvlsm/linux-x86_64/nvlsm-linux-x86_64-{{ .NVIDIA_NVLSM_PRODUCTION_VERSION }}-archive.tar.xz + destination: nvlsm.tar.xz + sha256: {{ .NVIDIA_NVLSM_PRODUCTION_AMD64_SHA256 }} + sha512: {{ .NVIDIA_NVLSM_PRODUCTION_AMD64_SHA512 }} # {{ end }} This in fact is YAML comment, but Go templating instruction is evaluated by bldr prepare: - | - tar -xf fabricmanager.tar.xz --strip-components=1 + mkdir fm sm + tar -xf fabricmanager.tar.xz --strip-components=1 -C fm + tar -xf nvlsm.tar.xz --strip-components=1 -C sm install: - | - mkdir -p /rootfs/usr/local/bin \ - /rootfs/usr/local/lib \ - /rootfs/usr/local/share/nvidia/nvswitch \ - /rootfs/usr/local/lib/containers/nvidia-fabricmanager \ - /rootfs/usr/local/etc/containers + mkdir -p /rootfs/usr/local/lib/containers/nvidia-fabricmanager/usr/bin \ + /rootfs/usr/local/lib/containers/nvidia-fabricmanager/usr/lib \ + /rootfs/usr/local/lib/containers/nvidia-fabricmanager/usr/share/nvidia/nvswitch \ + /rootfs/usr/local/lib/containers/nvidia-fabricmanager/opt/nvidia/nvlsm/sbin \ + /rootfs/usr/local/lib/containers/nvidia-fabricmanager/opt/nvidia/nvlsm/lib \ + /rootfs/usr/local/lib/containers/nvidia-fabricmanager/usr/share/nvidia/nvlsm + # nvlsm + - | + cp sm/sbin/nvlsm /rootfs/usr/local/lib/containers/nvidia-fabricmanager/opt/nvidia/nvlsm/sbin/ - cp lib/libnvfm.so.1 /rootfs/usr/local/lib/libnvfm.so.1 - ln -s libnvfm.so.1 /rootfs/usr/local/lib/libnvfm.so + cp sm/lib/libgrpc_mgr.so /rootfs/usr/local/lib/containers/nvidia-fabricmanager/opt/nvidia/nvlsm/lib/ - cp bin/nv-fabricmanager /rootfs/usr/local/bin/ - cp bin/nvswitch-audit /rootfs/usr/local/bin/ + cp sm/share/nvidia/nvlsm/device_configuration.conf \ + sm/share/nvidia/nvlsm/grpc_mgr.conf \ + sm/share/nvidia/nvlsm/nvlsm.conf \ + /rootfs/usr/local/lib/containers/nvidia-fabricmanager/usr/share/nvidia/nvlsm/ + # fabricmanager + - | + cp fm/bin/nv-fabricmanager \ + fm/bin/nvswitch-audit \ + /rootfs/usr/local/lib/containers/nvidia-fabricmanager/usr/bin/ - cp share/nvidia/nvswitch/* /rootfs/usr/local/share/nvidia/nvswitch/ + cp fm/lib/libnvfm.so.1 /rootfs/usr/local/lib/containers/nvidia-fabricmanager/usr/lib/ + ln -s libnvfm.so.1 /rootfs/usr/local/lib/containers/nvidia-fabricmanager/usr/lib/libnvfm.so - cp etc/fabricmanager.cfg /rootfs/usr/local/share/nvidia/nvswitch/ + cp fm/share/nvidia/nvswitch/* \ + fm/etc/fabricmanager.cfg \ + /rootfs/usr/local/lib/containers/nvidia-fabricmanager/usr/share/nvidia/nvswitch/ - cp /pkg/nvidia-fabricmanager.yaml /rootfs/usr/local/etc/containers/nvidia-fabricmanager.yaml + sed -i 's/DAEMONIZE=.*/DAEMONIZE=0/g' /rootfs/usr/local/lib/containers/nvidia-fabricmanager/usr/share/nvidia/nvswitch/fabricmanager.cfg + sed -i 's/LOG_FILE_NAME=.*/LOG_FILE_NAME=/g' /rootfs/usr/local/lib/containers/nvidia-fabricmanager/usr/share/nvidia/nvswitch/fabricmanager.cfg + sed -i 's#STATE_FILE_NAME=.*#STATE_FILE_NAME=/var/run/fabricmanager.state#g' /rootfs/usr/local/lib/containers/nvidia-fabricmanager/usr/share/nvidia/nvswitch/fabricmanager.cfg - echo "FABRIC_NODE_CONFIG_FILE=/usr/local/share/nvidia/nvswitch/fabricmanager.cfg" >> /rootfs/usr/local/share/nvidia/nvswitch/fabricmanager.cfg # fix for #511 - sed -i 's/DAEMONIZE=.*/DAEMONIZE=0/g' /rootfs/usr/local/share/nvidia/nvswitch/fabricmanager.cfg - sed -i 's/STATE_FILE_NAME=.*/STATE_FILE_NAME=\/var\/run\/nvidia-fabricmanager\/fabricmanager.state/g' /rootfs/usr/local/share/nvidia/nvswitch/fabricmanager.cfg - sed -i 's/TOPOLOGY_FILE_PATH=.*/TOPOLOGY_FILE_PATH=\/usr\/local\/share\/nvidia\/nvswitch/g' /rootfs/usr/local/share/nvidia/nvswitch/fabricmanager.cfg - sed -i 's/DATABASE_PATH=.*/DATABASE_PATH=\/usr\/local\/share\/nvidia\/nvswitch/g' /rootfs/usr/local/share/nvidia/nvswitch/fabricmanager.cfg + if grep -q '^DATABASE_PATH=' /rootfs/usr/local/lib/containers/nvidia-fabricmanager/usr/share/nvidia/nvswitch/fabricmanager.cfg + then + sed -i 's#DATABASE_PATH=.*#DATABASE_PATH=/var/run#g' /rootfs/usr/local/lib/containers/nvidia-fabricmanager/usr/share/nvidia/nvswitch/fabricmanager.cfg + else + echo -e '\nDATABASE_PATH=/var/run\n' >>/rootfs/usr/local/lib/containers/nvidia-fabricmanager/usr/share/nvidia/nvswitch/fabricmanager.cfg + fi + - | + mkdir -p /rootfs/usr/local/etc/containers + cp /pkg/nvidia-fabricmanager.yaml /rootfs/usr/local/etc/containers/nvidia-fabricmanager.yaml test: - | mkdir -p /extensions-validator-rootfs diff --git a/nvidia-gpu/vars.yaml b/nvidia-gpu/vars.yaml index 56c4fed5..eb3cfb87 100644 --- a/nvidia-gpu/vars.yaml +++ b/nvidia-gpu/vars.yaml @@ -9,6 +9,11 @@ NVIDIA_FABRIC_MANAGER_LTS_ARM64_SHA256: ea91191e91b306da1ee2932da399fab8fe46395e NVIDIA_FABRIC_MANAGER_LTS_ARM64_SHA512: 3f9ebf777dda108822e1f514d7b3b088faff3a07ce22c93118acbe3fc4bcb40791ff5692b7bf939253eab97aa03a6be26ce01d4407c7ced1a0231073cb308058 NVIDIA_FABRIC_MANAGER_LTS_AMD64_SHA256: f0220bfb67d04b4107acf00cc95abe5a9268fd8f8b5bae26971f4df232e4369c NVIDIA_FABRIC_MANAGER_LTS_AMD64_SHA512: 91b087fba76e0edd1dd35a0706467b05a483a7679a0415b6d6ab33b754a8b99ed59418906344c01eaef15c31f37afeb8fbaccac364f140c5a913067b463062f2 +NVIDIA_NVLSM_LTS_VERSION: 2025.06.5 +NVIDIA_NVLSM_LTS_ARM64_SHA512: d83322ea43748b5f19d1ea011ad8dde19218d80ff507255a1884bc2b600d875dd3cfe737b29f4edfc96065c2761e4a2eda110c25a62d7cd2528b9b1a5e24d457 +NVIDIA_NVLSM_LTS_ARM64_SHA256: 4f25e213c87acb9c0ac75ec0d84d7146af310170e0cd187ffc744c9d1d3a657d +NVIDIA_NVLSM_LTS_AMD64_SHA512: fe690537f9de77bf26d8c2982b91c16121ade9b1877b2dda9e7b5b3fed0841e5fa7c22bb30767b5bd45f27176346610b4eda7984b920e80474eaa81961625198 +NVIDIA_NVLSM_LTS_AMD64_SHA256: 4130187cb503805eb3c4d349513495bdb3c53e503a5b737b5d6053f623989c70 # renovate: datasource=github-releases extractVersion=^\d+\.(?\d+\.\d+)$ depName=nvidia/open-gpu-kernel-modules NVIDIA_DRIVER_PRODUCTION_VERSION: 570.195.03 NVIDIA_PKGS_PRODUCTION_ARM64_SHA256: fde2e6e02d7087d97bbeb9dda8d95ea668923c17b6926d56485a56a3989f6a58 @@ -19,6 +24,11 @@ NVIDIA_FABRIC_MANAGER_PRODUCTION_ARM64_SHA256: 0d3ff16fb5308345520cecb9cef3603f7 NVIDIA_FABRIC_MANAGER_PRODUCTION_ARM64_SHA512: 1a3d7fcebe04ebfb2e3b5be7a33c74f3ad90eefd6b8f73f59c0f8ecde91790b73e7632443a7a51dd467e332da1d5bb37d4b0bde4a590711fe5796f3c7a76c9a9 NVIDIA_FABRIC_MANAGER_PRODUCTION_AMD64_SHA256: 69d61798cf689f1c016f3fd6e3ea87fc467ee1d4f93453e3a3ce65f8a0250155 NVIDIA_FABRIC_MANAGER_PRODUCTION_AMD64_SHA512: 7fc686cda88438c5ef22c438448098553bd6759b4c0d75cea31664c42100c81f647808a7bb4883565a0e59be64d2ee7e50b70ca7da5336488b682913680a1f1c +NVIDIA_NVLSM_PRODUCTION_VERSION: 2025.03.1 +NVIDIA_NVLSM_PRODUCTION_ARM64_SHA512: 63584405c5de879a91dbb4a14c01fac2661cea646bc0d67d535e11aee62187f48c10bb3e7ed159daffb19b444bf42138ee30a123e926de4982624883efb02907 +NVIDIA_NVLSM_PRODUCTION_ARM64_SHA256: e260285ec01c6beb562a14625e9564b96374bb824ec62cc9866066a48710fa54 +NVIDIA_NVLSM_PRODUCTION_AMD64_SHA512: bd9a4cf7759d8fed7eb7a544063aa0c492db9145b491330f92a8a8a9247cdd7828e6e278f489029110fc98c05be25dcb3fc39d49add830858bfb81376d80f077 +NVIDIA_NVLSM_PRODUCTION_AMD64_SHA256: b034dad10a3154359e244b85206cd73f0fbce8e1cdf76058417b7b562c337388 # renovate: datasource=github-releases depName=nvidia/nvidia-container-toolkit CONTAINER_TOOLKIT_VERSION: v1.17.8 CONTAINER_TOOLKIT_REF: f202b80a9b9d0db00d9b1d73c0128c8962c55f4d @@ -43,3 +53,7 @@ LIBCAP_SHA512: f9448628ce036a10ce71958b25e5dd31032c4d86d8d34d905d2dfa32890ad4438 ELFUTILS_VERSION: 0.193 ELFUTILS_SHA256: 7857f44b624f4d8d421df851aaae7b1402cfe6bcdd2d8049f15fc07d3dde7635 ELFUTILS_SHA512: 557e328e3de0d2a69d09c15a9333f705f3233584e2c6a7d3ce855d06a12dc129e69168d6be64082803630397bd64e1660a8b5324d4f162d17922e10ddb367d76 +# renovate: datasource=github-releases extractVersion=^v(?.*)$ depName=linux-rdma/rdma-core +RDMA_CORE_VERSION: 57.0 +RDMA_CORE_SHA256: 5f94c463c931e4a9273f366ca7cb446b54d8bd4732288ade04679886be06862d +RDMA_CORE_SHA512: 4a904d34af6863655545fe720cc25a8800684f63c51cebb67be2058363949217903957dc925c69d41294362ccff75fb0d37f3bc31cd6f6f252a804d6713f62cf