@@ -46,7 +46,6 @@ if [[ "${AVAILABILITY}" == "false" ]]; then
 fi
 rm -f "${SCENARIO_ROOT}/tools/check-aws-availability/check-aws-availability"
 
-
 kops-acquire-latest
 
 # Cluster Configuration
@@ -85,6 +84,9 @@
 
 ${KOPS} update cluster --name "${CLUSTER_NAME}" --yes --admin
 
+echo "Listing nodes with their labels..."
+kubectl get nodes --show-labels
+
 echo "----------------------------------------------------------------"
 echo "Deploying AI Conformance Components"
 echo "----------------------------------------------------------------"
@@ -97,18 +99,45 @@ kubectl apply -f https://github.com/kubernetes-sigs/gateway-api/releases/downloa
 echo "Installing cert-manager..."
 kubectl apply --server-side -f https://github.com/cert-manager/cert-manager/releases/download/v1.19.2/cert-manager.yaml
 
-# Setup helm repo for NVIDIA GPU Operator and DRA Driver
+# Set up helm repos for monitoring and NVIDIA components
 helm repo add nvidia https://helm.ngc.nvidia.com/nvidia
+helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
 helm repo update
 
+# Prometheus Stack (kube-prometheus-stack)
+# Provides Prometheus, Alertmanager, and the ServiceMonitor CRDs (Grafana is disabled below).
+# Must be installed before dcgm-exporter so the ServiceMonitor CRDs are available.
+echo "Installing kube-prometheus-stack..."
+helm upgrade -i kube-prometheus-stack \
+  oci://ghcr.io/prometheus-community/charts/kube-prometheus-stack \
+  --namespace monitoring \
+  --create-namespace \
+  --set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false \
+  --set prometheus.prometheusSpec.podMonitorSelectorNilUsesHelmValues=false \
+  --set grafana.enabled=false \
+  --wait
+
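+# Optional sanity check (a sketch): confirm the ServiceMonitor CRD exists before
+# installing exporters that ship ServiceMonitor objects.
+kubectl get crd servicemonitors.monitoring.coreos.com \
+  || echo "Warning: ServiceMonitor CRD not found; the DCGM ServiceMonitor will fail to apply"
+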
 # NVIDIA GPU Operator
-# Manages the full NVIDIA stack: kernel driver, container toolkit, device plugin.
+# Manages the full NVIDIA stack: kernel driver, container toolkit, device plugin, DCGM exporter.
 # The driver is installed into /run/nvidia/driver on each node.
+# The DCGM exporter is enabled with a ServiceMonitor for Prometheus integration.
+echo "Installing NVIDIA GPU Operator with DCGM exporter..."
 helm upgrade -i nvidia-gpu-operator --wait \
-  -n gpu-operator --create-namespace \
-  nvidia/gpu-operator \
-  --version=v25.10.1 \
-  --wait
+  -n gpu-operator --create-namespace \
+  nvidia/gpu-operator \
+  --version=v25.10.1 \
+  --set dcgmExporter.enabled=true \
+  --set dcgmExporter.serviceMonitor.enabled=true \
+  --set dcgmExporter.serviceMonitor.additionalLabels.release=kube-prometheus-stack
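+# Note: with serviceMonitorSelectorNilUsesHelmValues=false set above, Prometheus
+# discovers ServiceMonitors regardless of labels; the release label is kept for
+# setups that restrict the selector to the Helm release.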
+
+echo "GPU Operator installation complete. Checking component status..."
+kubectl get pods -n gpu-operator -o wide || echo "Warning: could not list GPU Operator pods"
+
+echo "DCGM Exporter pod status in gpu-operator namespace..."
+kubectl get pods -n gpu-operator -l app=nvidia-dcgm-exporter -o wide || echo "Warning: DCGM exporter pods not found"
+
+echo "GPU Operator deployment details for debugging:"
+kubectl get all -n gpu-operator || true
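+
+# Optional sketch: scrape the DCGM exporter endpoint directly to confirm metrics are
+# flowing (assumes the operator's default service name nvidia-dcgm-exporter on port 9400).
+kubectl -n gpu-operator run dcgm-probe --rm -i --restart=Never --image=curlimages/curl -- \
+  curl -s http://nvidia-dcgm-exporter.gpu-operator.svc:9400/metrics | grep -m5 '^DCGM_' \
+  || echo "Warning: no DCGM metrics scraped yet"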
 
 PATH="$(pwd):$PATH"
 export PATH
@@ -117,7 +146,7 @@ export PATH
 # Uses the driver installed by GPU Operator at /run/nvidia/driver (the default).
 echo "Installing NVIDIA DRA Driver..."
 
-cat > values.yaml << EOF
+cat > values.yaml << EOF
 # The driver daemonset needs a toleration for the nvidia.com/gpu taint
 kubeletPlugin:
   tolerations:
@@ -127,15 +156,14 @@ kubeletPlugin:
 EOF
 
 helm upgrade -i nvidia-dra-driver-gpu nvidia/nvidia-dra-driver-gpu \
-  --version="25.12.0" \
-  --create-namespace \
-  --namespace nvidia-dra-driver-gpu \
-  --set resources.gpus.enabled=true \
-  --set nvidiaDriverRoot=/run/nvidia/driver \
-  --set gpuResourcesEnabledOverride=true \
-  -f values.yaml \
-  --wait
-
+  --version="25.12.0" \
+  --create-namespace \
+  --namespace nvidia-dra-driver-gpu \
+  --set resources.gpus.enabled=true \
+  --set nvidiaDriverRoot=/run/nvidia/driver \
+  --set gpuResourcesEnabledOverride=true \
+  -f values.yaml \
+  --wait
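+
+# Optional sketch: confirm the DRA driver published its API objects (these
+# resource.k8s.io kinds require a recent Kubernetes with DRA enabled).
+kubectl get deviceclasses || echo "Warning: no DeviceClasses found"
+kubectl get resourceslices || echo "Warning: no ResourceSlices published yet"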
 
 # KubeRay
 echo "Installing KubeRay Operator..."
@@ -145,7 +173,6 @@ kubectl apply --server-side -k "github.com/ray-project/kuberay/ray-operator/conf
 echo "Installing Kueue..."
 kubectl apply --server-side -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.14.8/manifests.yaml
 
-
 echo "----------------------------------------------------------------"
 echo "Verifying Cluster and Components"
 echo "----------------------------------------------------------------"
@@ -166,6 +193,20 @@ kubectl rollout status deployment -n kuberay-system kuberay-operator --timeout=5
 echo "Verifying Gateway API..."
 kubectl get gatewayclass || echo "Warning: GatewayClass not found"
 
+echo "Verifying Prometheus Stack..."
+kubectl rollout status deployment -n monitoring kube-prometheus-stack-operator --timeout=5m || echo "Warning: Prometheus Operator not ready yet"
+kubectl rollout status statefulset -n monitoring prometheus-kube-prometheus-stack-prometheus --timeout=5m || echo "Warning: Prometheus not ready yet"
+
+echo "Verifying DCGM Exporter in gpu-operator namespace..."
+kubectl rollout status daemonset -n gpu-operator nvidia-dcgm-exporter --timeout=5m || echo "Warning: DCGM Exporter not ready yet"
+
+echo "DCGM Exporter pod details for debugging:"
+kubectl get pods -n gpu-operator -l app=nvidia-dcgm-exporter -o wide || echo "Warning: No DCGM exporter pods found"
+kubectl describe pods -n gpu-operator -l app=nvidia-dcgm-exporter || true
+
+echo "Verifying ServiceMonitor for DCGM in gpu-operator namespace..."
+kubectl get servicemonitor -n gpu-operator nvidia-dcgm-exporter -o yaml || echo "Warning: DCGM ServiceMonitor not found"
+
 echo "Verifying Allocatable GPUs..."
 # Wait a bit for nodes to report resources
 sleep 30
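+# Optional sketch: per-node allocatable GPU counts (assumes the device plugin
+# advertises the nvidia.com/gpu resource).
+kubectl get nodes -o custom-columns='NAME:.metadata.name,GPU:.status.allocatable.nvidia\.com/gpu' || true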
@@ -216,6 +257,55 @@ echo "Waiting for Sample Workload to Complete..."
 kubectl wait --for=condition=complete job/test-gpu-pod --timeout=5m || true
 kubectl logs job/test-gpu-pod || echo "Failed to get logs"
 
+echo "Verifying GPU Metrics in Prometheus..."
+# Wait for the DCGM exporter to start collecting metrics, then query Prometheus
+# for DCGM GPU metrics with retries.
+PROM_POD=$(kubectl get pods -n monitoring -l app.kubernetes.io/name=prometheus -o jsonpath='{.items[0].metadata.name}')
+if [ -n "${PROM_POD}" ]; then
+  echo "Querying Prometheus for the DCGM_FI_DEV_GPU_UTIL metric (retrying up to 5 times)..."
+  for i in {1..5}; do
+    echo "Attempt $i..."
+    METRICS=$(kubectl exec -n monitoring "${PROM_POD}" -c prometheus -- \
+      wget -qO- 'http://localhost:9090/api/v1/query?query=DCGM_FI_DEV_GPU_UTIL' 2>/dev/null)
+    # Match "metric" rather than "result": the API returns an empty "result"
+    # array even when no series exist yet.
+    if echo "${METRICS}" | grep -q '"metric"'; then
+      echo "Successfully retrieved GPU metrics:"
+      echo "${METRICS}" | head -c 500
+      break
+    fi
+    echo "Metrics not yet available, waiting 20s..."
+    sleep 20
+  done
+else
+  echo "Warning: Prometheus pod not found"
+fi
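+
+# Optional sketch: a single aggregate PromQL query (same in-pod wget path as above)
+# averaging utilization across all GPUs, handy as one pass/fail signal.
+if [ -n "${PROM_POD}" ]; then
+  kubectl exec -n monitoring "${PROM_POD}" -c prometheus -- \
+    wget -qO- 'http://localhost:9090/api/v1/query?query=avg(DCGM_FI_DEV_GPU_UTIL)' || true
+fi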
+
+echo "----------------------------------------------------------------"
+echo "GPU Operator Component Status for Debugging"
+echo "----------------------------------------------------------------"
+
+echo "All GPU Operator pods:"
+kubectl get pods -n gpu-operator -o wide || true
+
+echo ""
+echo "GPU Operator DaemonSets:"
+kubectl get daemonsets -n gpu-operator -o wide || true
+
+echo ""
+echo "DCGM Exporter DaemonSet details:"
+kubectl describe daemonset -n gpu-operator nvidia-dcgm-exporter || true
+
+echo ""
+echo "DCGM Exporter Service:"
+kubectl get service -n gpu-operator nvidia-dcgm-exporter -o yaml || echo "No DCGM service found"
+
+echo ""
+echo "DCGM Exporter ServiceMonitor:"
+kubectl get servicemonitor -n gpu-operator nvidia-dcgm-exporter -o yaml || echo "No ServiceMonitor found"
+
+echo ""
+echo "Recent GPU Operator events:"
+kubectl get events -n gpu-operator --sort-by='.lastTimestamp' | tail -20 || true
+
 echo "AI Conformance Environment Setup Complete."
 
 # Now run the actual AI conformance tests