Skip to content

Commit 07e9b55

Browse files
committed
tests/ai-conformance: use nvidia GPU operator to install GPU drivers
1 parent a176ffe commit 07e9b55

File tree

1 file changed

+13
-23
lines changed

1 file changed

+13
-23
lines changed

tests/e2e/scenarios/ai-conformance/run-test.sh

Lines changed: 13 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -64,13 +64,12 @@ kops-acquire-latest
6464

6565
# Cluster Configuration
6666
# - Networking: Cilium with Gateway API enabled
67-
# - Nodes: c5.large
68-
# - Runtime: NVIDIA enabled
67+
# - Nodes: c5.large (we need some non-GPU nodes for non-GPU workloads)
68+
# - NVIDIA driver and runtime are managed by GPU Operator (not kOps)
6969
OVERRIDES="${OVERRIDES-} --networking=cilium"
7070
OVERRIDES="${OVERRIDES} --set=cluster.spec.networking.cilium.gatewayAPI.enabled=true"
7171
OVERRIDES="${OVERRIDES} --node-size=c5.large"
7272
OVERRIDES="${OVERRIDES} --node-count=2"
73-
OVERRIDES="${OVERRIDES} --set=cluster.spec.containerd.nvidiaGPU.enabled=true"
7473

7574
kops-up
7675

@@ -120,30 +119,19 @@ kubectl apply --server-side -f https://github.com/cert-manager/cert-manager/rele
120119
# kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/refs/tags/v0.17.0/deployments/static/nvidia-device-plugin.yml
121120

122121
# NVIDIA GPU Operator
122+
# Manages the full NVIDIA stack: kernel driver, container toolkit, device plugin.
123+
# The driver is installed into /run/nvidia/driver on each node.
123124
helm upgrade -i nvidia-gpu-operator --wait \
124125
-n gpu-operator --create-namespace \
125126
nvidia/gpu-operator \
126-
--version=v25.10.1
127+
--version=v25.10.1 \
128+
--wait
127129

128-
# 1.5 NVIDIA DRA Driver
129130

131+
# NVIDIA DRA Driver
132+
# Uses the driver installed by GPU Operator at /run/nvidia/driver (the default).
130133
echo "Installing NVIDIA DRA Driver..."
131134

132-
# cat > values.yaml <<EOF
133-
# kubeletPlugins:
134-
# nodeSelector:
135-
# feature.node.kubernetes.io/pci-0302_10de.present: "true"
136-
# EOF
137-
138-
# helm upgrade -i nvidia-dra-driver-gpu nvidia/nvidia-dra-driver-gpu \
139-
# --create-namespace \
140-
# --namespace nvidia-dra-driver-gpu \
141-
# --version 25.12.0 \
142-
# --set resources.gpus.enabled=true \
143-
# --set gpuResourcesEnabledOverride=true \
144-
# -f values.yaml \
145-
# --wait
146-
147135
cat > values.yaml <<EOF
148136
# The driver daemonset needs a toleration for the nvidia.com/gpu taint
149137
kubeletPlugin:
@@ -182,9 +170,11 @@ echo "----------------------------------------------------------------"
182170
# Wait for kOps validation
183171
"${KOPS}" validate cluster --wait=15m
184172

185-
# Verify Components
186-
echo "Verifying NVIDIA Device Plugin..."
187-
#kubectl rollout status daemonset -n kube-system nvidia-device-plugin-daemonset --timeout=5m || echo "Warning: NVIDIA Device Plugin not ready yet"
173+
echo "Verifying GPU Operator driver..."
174+
kubectl rollout status daemonset -n gpu-operator nvidia-driver-daemonset --timeout=5m || echo "Warning: GPU Operator driver daemonset not ready yet"
175+
176+
echo "Verifying GPU Operator device plugin..."
177+
kubectl rollout status daemonset -n gpu-operator nvidia-device-plugin-daemonset --timeout=5m || echo "Warning: GPU Operator device plugin not ready yet"
188178

189179
echo "Verifying Kueue..."
190180
kubectl rollout status deployment -n kueue-system kueue-controller-manager --timeout=5m || echo "Warning: Kueue not ready yet"

0 commit comments

Comments
 (0)