Skip to content

Commit 7725931

Browse files
authored
Merge pull request #18074 from ameukam/ai-conformance-validate-runtime
[aiconformance]: Add driver_runtime_management validator
2 parents e250338 + 54c6506 commit 7725931

File tree

2 files changed

+167
-0
lines changed

2 files changed

+167
-0
lines changed
Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
/*
2+
Copyright The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package driver_runtime_management
18+
19+
import (
20+
"fmt"
21+
"strings"
22+
"testing"
23+
24+
"k8s.io/kops/tests/e2e/scenarios/ai-conformance/validators"
25+
"k8s.io/kops/tests/e2e/scenarios/ai-conformance/validators/kubeobjects"
26+
)
27+
28+
// TestAcceleratorsDriverRuntimeManagement corresponds to the accelerators/driver_runtime_management scenario.
29+
// This test verifies that compatible accelerator drivers and corresponding container runtime configurations
30+
// are correctly installed and maintained on nodes with accelerators.
31+
func TestAcceleratorsDriverRuntimeManagement(t *testing.T) {
32+
33+
h := validators.NewValidatorHarness(t)
34+
35+
h.Logf("# Driver and Runtime Management Verification")
36+
37+
// Step 1: Identify GPU nodes
38+
h.Logf("## Identifying GPU Nodes")
39+
result := h.ShellExec("kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{\"\\t\"}{.status.capacity.nvidia\\.com/gpu}{\"\\n\"}{end}'")
40+
gpuNodesOutput := result.Stdout()
41+
42+
var gpuNodes []string
43+
for _, line := range strings.Split(strings.TrimSpace(gpuNodesOutput), "\n") {
44+
if line == "" {
45+
continue
46+
}
47+
fields := strings.Fields(line)
48+
if len(fields) >= 2 && fields[1] != "" && fields[1] != "<none>" {
49+
nodeName := fields[0]
50+
gpuCount := fields[1]
51+
h.Logf("* Node %s has %s NVIDIA GPUs", nodeName, gpuCount)
52+
gpuNodes = append(gpuNodes, nodeName)
53+
}
54+
}
55+
56+
if len(gpuNodes) == 0 {
57+
h.Skip("No GPU nodes found in cluster; skipping driver runtime management test")
58+
return
59+
}
60+
61+
h.Success("Found %d GPU node(s): %s", len(gpuNodes), strings.Join(gpuNodes, ", "))
62+
63+
// Step 2: Verify GPU Operator DaemonSet is deployed and healthy
64+
h.Logf("## Verifying GPU Operator DaemonSet")
65+
daemonSets := h.ListDaemonSets("gpu-operator")
66+
if len(daemonSets) == 0 {
67+
h.Fatalf("No DaemonSets found in gpu-operator namespace. GPU operator may not be installed.")
68+
}
69+
70+
// Look for the nvidia-driver-daemonset (or similar) that manages driver installation
71+
var driverDaemonSet *kubeobjects.DaemonSet
72+
for _, ds := range daemonSets {
73+
h.Logf("* Found DaemonSet: %s (Ready: %d/%d)", ds.Name(), ds.NumberReady(), ds.DesiredNumberScheduled())
74+
// The driver daemonset typically has "driver" in its name
75+
if strings.Contains(ds.Name(), "driver") {
76+
driverDaemonSet = ds
77+
}
78+
}
79+
80+
if driverDaemonSet != nil {
81+
if driverDaemonSet.NumberReady() != driverDaemonSet.DesiredNumberScheduled() {
82+
h.Fatalf("Driver DaemonSet %s is not fully ready: %d/%d pods ready",
83+
driverDaemonSet.Name(), driverDaemonSet.NumberReady(), driverDaemonSet.DesiredNumberScheduled())
84+
}
85+
h.Success("Driver DaemonSet %s is healthy: %d/%d pods ready",
86+
driverDaemonSet.Name(), driverDaemonSet.NumberReady(), driverDaemonSet.DesiredNumberScheduled())
87+
} else {
88+
h.Logf("Warning: No driver-specific DaemonSet found, but GPU operator is deployed")
89+
}
90+
91+
// Step 3: Verify driver installation on GPU nodes using a diagnostic job
92+
h.Logf("## Verifying Driver Installation with Diagnostic Job")
93+
ns := h.TestNamespace()
94+
objects := h.ApplyManifest(ns, "testdata/driver-check.yaml")
95+
96+
// Wait for the job to complete
97+
for _, obj := range objects {
98+
if obj.Kind() == "Job" {
99+
obj.KubectlWait(validators.WithTimeout("5m"))
100+
}
101+
}
102+
103+
// Get the job logs to verify driver check succeeded
104+
logsResult := h.ShellExec(fmt.Sprintf("kubectl logs --namespace %s job/driver-check", ns))
105+
logs := logsResult.Stdout()
106+
107+
// Verify key indicators in the logs
108+
if !strings.Contains(logs, "NVIDIA-SMI") {
109+
h.Fatalf("NVIDIA-SMI not found in driver check output")
110+
}
111+
112+
if !strings.Contains(logs, "driver_version") {
113+
h.Fatalf("driver_version not found in nvidia-smi output")
114+
}
115+
116+
if !strings.Contains(logs, "SUCCESS") {
117+
h.Fatalf("Driver check job did not report success")
118+
}
119+
120+
h.Success("NVIDIA driver and runtime successfully verified on GPU nodes")
121+
122+
// Step 4: Check for DRA integration (future-proofing)
123+
h.Logf("## Checking for DRA Driver/Runtime Version Exposure")
124+
if h.HasDeviceClass("gpu.nvidia.com") {
125+
// Once DRA exposes driver/runtime versions, we should query them here
126+
h.Logf("* DRA DeviceClass 'gpu.nvidia.com' found")
127+
h.Logf("* Note: DRA-based driver version verification not yet implemented in this test")
128+
h.Logf("* Future enhancement: Query driver/runtime versions via DRA APIs")
129+
} else {
130+
h.Logf("* DRA DeviceClass 'gpu.nvidia.com' not found (DRA driver version exposure may not be available yet)")
131+
}
132+
133+
// Record conformance
134+
if h.AllPassed() {
135+
h.RecordConformance("accelerators/driver_runtime_management")
136+
h.Success("Driver Runtime Management conformance test PASSED")
137+
}
138+
}
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# Diagnostic Job applied by the driver_runtime_management conformance test.
# It runs nvidia-smi inside a CUDA base image; the test then greps the Job
# logs for "NVIDIA-SMI", "driver_version", and "SUCCESS" markers.
apiVersion: batch/v1
kind: Job
metadata:
  name: driver-check
spec:
  template:
    spec:
      # One-shot diagnostic: do not restart the pod on failure.
      restartPolicy: Never
      containers:
      - name: nvidia-smi
        # CUDA base image; nvidia-smi is expected to be injected by the
        # node's driver/container-runtime integration.
        image: nvcr.io/nvidia/cuda:12.6.0-base-ubuntu22.04
        command:
        - /bin/bash
        - -c
        # set -e makes any failing nvidia-smi call abort before the final
        # SUCCESS line is printed, which the Go test treats as a failure.
        - |
          set -e
          echo "=== NVIDIA Driver Check ==="
          nvidia-smi --version
          echo ""
          echo "=== GPU Detection ==="
          nvidia-smi -L
          echo ""
          echo "=== Driver Information ==="
          nvidia-smi --query-gpu=driver_version,name,uuid --format=csv
          echo ""
          echo "SUCCESS: NVIDIA driver and runtime verified"
        resources:
          limits:
            # Requesting one GPU steers the pod onto a GPU-capable node.
            nvidia.com/gpu: 1

0 commit comments

Comments (0)