@@ -64,13 +64,12 @@ kops-acquire-latest
6464
6565# Cluster Configuration
6666# - Networking: Cilium with Gateway API enabled
67- # - Nodes: c5.large
68- # - Runtime: NVIDIA enabled
67+ # - Nodes: c5.large (we need some non-GPU nodes for non-GPU workloads)
68+ # - NVIDIA driver and runtime are managed by GPU Operator (not kOps)
6969OVERRIDES=" ${OVERRIDES-} --networking=cilium"
7070OVERRIDES=" ${OVERRIDES} --set=cluster.spec.networking.cilium.gatewayAPI.enabled=true"
7171OVERRIDES=" ${OVERRIDES} --node-size=c5.large"
7272OVERRIDES=" ${OVERRIDES} --node-count=2"
73- OVERRIDES=" ${OVERRIDES} --set=cluster.spec.containerd.nvidiaGPU.enabled=true"
7473
7574kops-up
7675
@@ -120,30 +119,19 @@ kubectl apply --server-side -f https://github.com/cert-manager/cert-manager/rele
120119# kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/refs/tags/v0.17.0/deployments/static/nvidia-device-plugin.yml
121120
122121# NVIDIA GPU Operator
122+ # Manages the full NVIDIA stack: kernel driver, container toolkit, device plugin.
123+ # The driver is installed into /run/nvidia/driver on each node.
123124helm upgrade -i nvidia-gpu-operator --wait \
124125 -n gpu-operator --create-namespace \
125126 nvidia/gpu-operator \
126127 --version=v25.10.1
127129
128- # 1.5 NVIDIA DRA Driver
129130
131+ # NVIDIA DRA Driver
132+ # Uses the driver installed by GPU Operator at /run/nvidia/driver (the default).
130133echo " Installing NVIDIA DRA Driver..."
131134
132- # cat > values.yaml <<EOF
133- # kubeletPlugins:
134- # nodeSelector:
135- # feature.node.kubernetes.io/pci-0302_10de.present: "true"
136- # EOF
137-
138- # helm upgrade -i nvidia-dra-driver-gpu nvidia/nvidia-dra-driver-gpu \
139- # --create-namespace \
140- # --namespace nvidia-dra-driver-gpu \
141- # --version 25.12.0 \
142- # --set resources.gpus.enabled=true \
143- # --set gpuResourcesEnabledOverride=true \
144- # -f values.yaml \
145- # --wait
146-
147135cat > values.yaml << EOF
148136# The driver daemonset needs a toleration for the nvidia.com/gpu taint
149137kubeletPlugin:
@@ -182,9 +170,11 @@ echo "----------------------------------------------------------------"
182170# Wait for kOps validation
183171" ${KOPS} " validate cluster --wait=15m
184172
185- # Verify Components
186- echo " Verifying NVIDIA Device Plugin..."
187- # kubectl rollout status daemonset -n kube-system nvidia-device-plugin-daemonset --timeout=5m || echo "Warning: NVIDIA Device Plugin not ready yet"
173+ echo " Verifying GPU Operator driver..."
174+ kubectl rollout status daemonset -n gpu-operator nvidia-driver-daemonset --timeout=5m || echo " Warning: GPU Operator driver daemonset not ready yet"
175+
176+ echo " Verifying GPU Operator device plugin..."
177+ kubectl rollout status daemonset -n gpu-operator nvidia-device-plugin-daemonset --timeout=5m || echo " Warning: GPU Operator device plugin not ready yet"
188178
189179echo " Verifying Kueue..."
190180kubectl rollout status deployment -n kueue-system kueue-controller-manager --timeout=5m || echo " Warning: Kueue not ready yet"