README.md (66 changes: 65 additions & 1 deletion)
@@ -356,10 +356,72 @@ You can use the IDs of the GPUs as well as the GPU sharing settings set in
these environment variables to verify that they were handed out in a way
consistent with the semantics shown in the figure above.

### Demo DRA Admin Access Feature
This example driver includes support for the [DRA AdminAccess feature](https://kubernetes.io/docs/concepts/scheduling-eviction/dynamic-resource-allocation/#admin-access), which allows administrators to gain privileged access to devices already in use by other users. In this example, only workloads with admin access resource claims can view sensitive host hardware information through environment variables.

When admin access is granted, the following host information is made available through environment variables:

- `DRA_ADMIN_ACCESS=true` - Indicates admin access is enabled
- `HOST_CPU_INFO` - CPU model and core count information from /proc/cpuinfo
- `HOST_MEMORY_INFO` - Memory capacity and availability from /proc/meminfo
- `HOST_KERNEL_INFO` - Kernel version information from /proc/version
- `HOST_SYSTEM_INFO` - Operating system and architecture (GOOS, GOARCH)
- `HOST_NETWORK_INFO` - Available network interfaces from /proc/net/dev
- `HOST_STORAGE_INFO` - Root filesystem and mount information from /proc/mounts

#### CDI Integration

When admin access is detected, the CDI (Container Device Interface) handler injects the host hardware environment variables into the container specification.
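
The code for this lives in the `cmd/dra-example-kubeletplugin/cdi.go` changes further down in this diff. As a rough sketch of that logic (the `adminAccessEnv` helper below is illustrative only and does not exist in the driver), the handler builds the extra environment entries roughly like this:

```go
package main

import "fmt"

// adminAccessEnv sketches the injection performed by the CDIHandler:
// DRA_ADMIN_ACCESS is always set, and the HOST_* values from the
// HostHardwareInfo struct (added in discovery.go) are appended only when
// the claim was allocated with admin access.
func adminAccessEnv(adminAccess bool, info *HostHardwareInfo) []string {
	env := []string{fmt.Sprintf("DRA_ADMIN_ACCESS=%t", adminAccess)}
	if adminAccess {
		env = append(env,
			fmt.Sprintf("HOST_CPU_INFO=%s", info.CPUInfo),
			fmt.Sprintf("HOST_MEMORY_INFO=%s", info.MemInfo),
			fmt.Sprintf("HOST_KERNEL_INFO=%s", info.KernelInfo),
			fmt.Sprintf("HOST_SYSTEM_INFO=%s", info.SystemInfo),
			fmt.Sprintf("HOST_NETWORK_INFO=%s", info.NetworkInfo),
			fmt.Sprintf("HOST_STORAGE_INFO=%s", info.StorageInfo),
		)
	}
	return env
}
```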

#### Usage Example

See `demo/gpu-test7.yaml` for a complete example. Key points:

1. **Namespace**: The namespace must have the admin-access label in order to create a ResourceClaimTemplate or ResourceClaim with `adminAccess: true`. Starting with Kubernetes v1.34, this label will be `resource.kubernetes.io/admin-access`.
```yaml
apiVersion: v1
kind: Namespace
metadata:
  name: gpu-test7
  labels:
    resource.k8s.io/admin-access: "true"
```

2. **Resource Claim Template**: The request must set `adminAccess: true`. Optionally, include `allocationMode: All` if the workload needs information about all hardware devices.
```yaml
spec:
  spec:
    devices:
      requests:
      - name: admin-gpu
        exactly:
          deviceClassName: gpu.example.com
          allocationMode: All
          adminAccess: true
```

3. **Container**: The container receives the host hardware information via environment variables:
```bash
echo "DRA Admin Access: $DRA_ADMIN_ACCESS"
echo "Host CPU Info: $HOST_CPU_INFO"
echo "Host Memory Info: $HOST_MEMORY_INFO"
# ... additional host info variables
```

#### Testing

To run this demo:
```bash
./demo/test-admin-access.sh
```
Note: Workloads with admin access resource claims can access devices already in use by other workloads, and only such workloads can view the sensitive host hardware information exposed through environment variables.

### Clean Up

Once you have verified everything is running correctly, delete all of the
example apps:
```bash
kubectl delete --wait=false --filename=demo/gpu-test{1,2,3,4,5}.yaml
kubectl delete --wait=false --filename=demo/gpu-test{1,2,3,4,5,7}.yaml
```

And wait for them to terminate:
@@ -374,6 +436,8 @@ gpu-test3 pod0 1/1 Terminating 0 31m
gpu-test3 pod1 1/1 Terminating 0 31m
gpu-test4 pod0 1/1 Terminating 0 31m
gpu-test5 pod0 4/4 Terminating 0 31m
gpu-test7 pod0 1/1 Terminating 0 31m
gpu-test7 pod1 1/1 Terminating 0 31m
...
```

cmd/dra-example-kubeletplugin/cdi.go (23 changes: 20 additions & 3 deletions)
@@ -36,18 +36,20 @@
)

type CDIHandler struct {
cache *cdiapi.Cache
cache *cdiapi.Cache
hostHardwareInfo *HostHardwareInfo
}

func NewCDIHandler(config *Config) (*CDIHandler, error) {
func NewCDIHandler(config *Config, hostHardwareInfo *HostHardwareInfo) (*CDIHandler, error) {
cache, err := cdiapi.NewCache(
cdiapi.WithSpecDirs(config.flags.cdiRoot),
)
if err != nil {
return nil, fmt.Errorf("unable to create a new CDI cache: %w", err)
}
handler := &CDIHandler{
cache: cache,
cache: cache,
hostHardwareInfo: hostHardwareInfo,
}

return handler, nil
@@ -96,9 +98,24 @@ func (cdi *CDIHandler) CreateClaimSpecFile(claimUID string, devices PreparedDevi
ContainerEdits: &cdispec.ContainerEdits{
Env: []string{
fmt.Sprintf("GPU_DEVICE_%s_RESOURCE_CLAIM=%s", device.DeviceName[4:], claimUID),
fmt.Sprintf("DRA_ADMIN_ACCESS=%t", device.AdminAccess),
},
},
}

// If this device has admin access, inject host hardware information
if device.AdminAccess {
hostEnvVars := []string{
fmt.Sprintf("HOST_CPU_INFO=%s", cdi.hostHardwareInfo.CPUInfo),
fmt.Sprintf("HOST_MEMORY_INFO=%s", cdi.hostHardwareInfo.MemInfo),
fmt.Sprintf("HOST_KERNEL_INFO=%s", cdi.hostHardwareInfo.KernelInfo),
fmt.Sprintf("HOST_SYSTEM_INFO=%s", cdi.hostHardwareInfo.SystemInfo),
fmt.Sprintf("HOST_NETWORK_INFO=%s", cdi.hostHardwareInfo.NetworkInfo),
fmt.Sprintf("HOST_STORAGE_INFO=%s", cdi.hostHardwareInfo.StorageInfo),
}
claimEdits.ContainerEdits.Env = append(claimEdits.ContainerEdits.Env, hostEnvVars...)
}

claimEdits.Append(device.ContainerEdits)

cdiDevice := cdispec.Device{
cmd/dra-example-kubeletplugin/discovery.go (181 changes: 181 additions & 0 deletions)
@@ -17,9 +17,12 @@
package main

import (
"bufio"
"fmt"
"math/rand"
"os"
"runtime"
"strings"

resourceapi "k8s.io/api/resource/v1beta1"
"k8s.io/apimachinery/pkg/api/resource"
@@ -84,3 +87,181 @@ func hash(s string) int64 {
}
return h
}

// HostHardwareInfo represents hardware information available to admin access.
type HostHardwareInfo struct {
CPUInfo string
MemInfo string
KernelInfo string
SystemInfo string
NetworkInfo string
StorageInfo string
}

// GetHostHardwareInfo gathers hardware information from the host system.
func GetHostHardwareInfo() (*HostHardwareInfo, error) {
info := &HostHardwareInfo{}

// Get CPU information
if cpuInfo, err := readFile("/proc/cpuinfo"); err == nil {
> **Reviewer comment (Contributor):** Since all the rest of the data coming out of the driver is entirely OS- and hardware-agnostic AFAIK, could we follow that here? Even if admin access is implemented here as simply as adding only DRA_ADMIN_ACCESS=true, I think that's enough to demonstrate the end-to-end flow. And it doesn't assume any details about the underlying OS or hardware, which I think is an important part of making the example driver usable in as many places as possible.
>
> e.g. @lauralorenz just brought up the question of whether the example driver works on Windows. Without this change the answer is "maybe?" but reading /proc would make that a more conclusive "no." I know the driver wouldn't fail and we'd only get the "unavailable" value in that case, but I'd rather avoid any evidence of assumptions like that if possible.

info.CPUInfo = extractCPUModel(cpuInfo)
} else {
info.CPUInfo = fmt.Sprintf("CPU Info unavailable: %v", err)
}

// Get memory information
if memInfo, err := readFile("/proc/meminfo"); err == nil {
info.MemInfo = extractMemInfo(memInfo)
} else {
info.MemInfo = fmt.Sprintf("Memory Info unavailable: %v", err)
}

// Get kernel information
if kernelInfo, err := readFile("/proc/version"); err == nil {
info.KernelInfo = strings.TrimSpace(kernelInfo)
} else {
info.KernelInfo = fmt.Sprintf("Kernel Info unavailable: %v", err)
}

// Get system information (architecture, OS)
info.SystemInfo = fmt.Sprintf("GOOS: %s, GOARCH: %s", runtime.GOOS, runtime.GOARCH)

// Get network information
if netInfo, err := getNetworkInfo(); err == nil {
info.NetworkInfo = netInfo
} else {
info.NetworkInfo = fmt.Sprintf("Network Info unavailable: %v", err)
}

// Get storage information
if storageInfo, err := getStorageInfo(); err == nil {
info.StorageInfo = storageInfo
} else {
info.StorageInfo = fmt.Sprintf("Storage Info unavailable: %v", err)
}

return info, nil
}

func readFile(path string) (string, error) {
data, err := os.ReadFile(path)
if err != nil {
return "", err
}
return string(data), nil
}

func extractCPUModel(cpuInfo string) string {
scanner := bufio.NewScanner(strings.NewReader(cpuInfo))
cpuCount := 0
var modelName string

for scanner.Scan() {
line := scanner.Text()
if strings.HasPrefix(line, "processor") {
cpuCount++
}
if strings.HasPrefix(line, "model name") {
parts := strings.Split(line, ":")
if len(parts) > 1 {
modelName = strings.TrimSpace(parts[1])
}
}
}

if modelName != "" {
return fmt.Sprintf("%d x %s", cpuCount, modelName)
}
return fmt.Sprintf("%d CPU cores", cpuCount)
}

func extractMemInfo(memInfo string) string {
scanner := bufio.NewScanner(strings.NewReader(memInfo))
var totalMem, availableMem string

for scanner.Scan() {
line := scanner.Text()
if strings.HasPrefix(line, "MemTotal:") {
parts := strings.Fields(line)
if len(parts) >= 2 {
totalMem = parts[1] + "kB"
}
}
if strings.HasPrefix(line, "MemAvailable:") {
parts := strings.Fields(line)
if len(parts) >= 2 {
availableMem = parts[1] + "kB"
}
}
}

if totalMem != "" && availableMem != "" {
return fmt.Sprintf("Total: %s, Available: %s", totalMem, availableMem)
}
return "Memory information parsing failed"
}

func getNetworkInfo() (string, error) {
// Read network interfaces
interfaces, err := readFile("/proc/net/dev")
if err != nil {
return "", err
}

scanner := bufio.NewScanner(strings.NewReader(interfaces))
var interfaceNames []string

for scanner.Scan() {
line := strings.TrimSpace(scanner.Text())
if strings.Contains(line, ":") && !strings.Contains(line, "Inter-|") && !strings.Contains(line, "face |") {
parts := strings.Split(line, ":")
if len(parts) > 0 {
ifName := strings.TrimSpace(parts[0])
if ifName != "lo" { // Skip loopback
interfaceNames = append(interfaceNames, ifName)
}
}
}
}

return fmt.Sprintf("Network Interfaces: %s", strings.Join(interfaceNames, ", ")), nil
}

func getStorageInfo() (string, error) {
// Read mounted filesystems
mounts, err := readFile("/proc/mounts")
if err != nil {
return "", err
}

scanner := bufio.NewScanner(strings.NewReader(mounts))
var rootFS string
var mountCount int

for scanner.Scan() {
line := scanner.Text()
fields := strings.Fields(line)
if len(fields) >= 3 {
mountPoint := fields[1]
fsType := fields[2]

if mountPoint == "/" {
rootFS = fmt.Sprintf("Root FS: %s (%s)", fields[0], fsType)
}

// Count real filesystems (not virtual ones)
if !strings.HasPrefix(fields[0], "/proc") &&
!strings.HasPrefix(fields[0], "/sys") &&
!strings.HasPrefix(fields[0], "/dev/pts") &&
fsType != "tmpfs" && fsType != "devtmpfs" &&
fsType != "cgroup" && fsType != "cgroup2" {
mountCount++
}
}
}

if rootFS != "" {
return fmt.Sprintf("%s, Total mounts: %d", rootFS, mountCount), nil
}
return fmt.Sprintf("Storage mounts: %d", mountCount), nil
}
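
The call sites that wire `GetHostHardwareInfo` into `NewCDIHandler` are outside this diff. The sketch below shows the presumed hand-off; the `newHandlerWithHostInfo` helper is hypothetical and only illustrates how the two new pieces fit together:

```go
package main

import "fmt"

// newHandlerWithHostInfo is a hypothetical helper (not part of this PR):
// host information is gathered once at startup and handed to the CDI
// handler, which later injects it for admin-access claims.
func newHandlerWithHostInfo(config *Config) (*CDIHandler, error) {
	hostInfo, err := GetHostHardwareInfo()
	if err != nil {
		return nil, fmt.Errorf("unable to gather host hardware info: %w", err)
	}
	return NewCDIHandler(config, hostInfo)
}
```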
cmd/dra-example-kubeletplugin/driver.go (4 changes: 4 additions & 0 deletions)
@@ -101,6 +101,9 @@ func (d *driver) Shutdown(logger klog.Logger) error {

func (d *driver) PrepareResourceClaims(ctx context.Context, claims []*resourceapi.ResourceClaim) (map[types.UID]kubeletplugin.PrepareResult, error) {
klog.Infof("PrepareResourceClaims is called: number of claims: %d", len(claims))
for i, claim := range claims {
klog.Infof("Claim %d: UID=%s, Namespace=%s, Name=%s", i, claim.UID, claim.Namespace, claim.Name)
> **Reviewer comment (Contributor):** Could we add this log to prepareResourceClaim so we don't need this extra loop?

}
result := make(map[types.UID]kubeletplugin.PrepareResult)

for _, claim := range claims {
@@ -113,6 +116,7 @@ func (d *driver) prepareResourceClaim(_ context.Context, claim *resourceap
func (d *driver) prepareResourceClaim(_ context.Context, claim *resourceapi.ResourceClaim) kubeletplugin.PrepareResult {
preparedPBs, err := d.state.Prepare(claim)
if err != nil {
klog.Errorf("Error preparing devices for claim %v: %v", claim.UID, err)
return kubeletplugin.PrepareResult{
Err: fmt.Errorf("error preparing devices for claim %v: %w", claim.UID, err),
}
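
A minimal sketch of the change the reviewer suggests above: log each claim inside `prepareResourceClaim` so the extra loop in `PrepareResourceClaims` is not needed. This is illustrative only, not code from this PR, and `remainderOfPrepare` is a hypothetical stand-in for the unchanged rest of the function:

```go
func (d *driver) prepareResourceClaim(_ context.Context, claim *resourceapi.ResourceClaim) kubeletplugin.PrepareResult {
	// Log the claim identity here instead of in a separate loop in
	// PrepareResourceClaims.
	klog.Infof("Preparing claim: UID=%s, Namespace=%s, Name=%s", claim.UID, claim.Namespace, claim.Name)

	preparedPBs, err := d.state.Prepare(claim)
	if err != nil {
		klog.Errorf("Error preparing devices for claim %v: %v", claim.UID, err)
		return kubeletplugin.PrepareResult{
			Err: fmt.Errorf("error preparing devices for claim %v: %w", claim.UID, err),
		}
	}

	// remainderOfPrepare stands in for the existing, unchanged code that
	// turns preparedPBs into a kubeletplugin.PrepareResult.
	return remainderOfPrepare(preparedPBs)
}
```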