Skip to content

Commit dae411d

Browse files
authored
Merge pull request #18018 from ameukam/test-ai-conformance-metrics
tests/ai-conformance: Install prometheus for metrics
2 parents ffc7f4c + 742f3b3 commit dae411d

1 file changed

Lines changed: 108 additions & 18 deletions

File tree

tests/e2e/scenarios/ai-conformance/run-test.sh

Lines changed: 108 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,6 @@ if [[ "${AVAILABILITY}" == "false" ]]; then
4646
fi
4747
rm -f "${SCENARIO_ROOT}/tools/check-aws-availability/check-aws-availability"
4848

49-
5049
kops-acquire-latest
5150

5251
# Cluster Configuration
@@ -85,6 +84,9 @@ EOF
8584

8685
${KOPS} update cluster --name "${CLUSTER_NAME}" --yes --admin
8786

87+
echo "Listing nodes with their labels..."
88+
kubectl get nodes --show-labels
89+
8890
echo "----------------------------------------------------------------"
8991
echo "Deploying AI Conformance Components"
9092
echo "----------------------------------------------------------------"
@@ -97,18 +99,45 @@ kubectl apply -f https://github.com/kubernetes-sigs/gateway-api/releases/downloa
9799
echo "Installing cert-manager..."
98100
kubectl apply --server-side -f https://github.com/cert-manager/cert-manager/releases/download/v1.19.2/cert-manager.yaml
99101

100-
# Setup helm repo for NVIDIA GPU Operator and DRA Driver
102+
# Setup helm repos for monitoring and NVIDIA components
101103
helm repo add nvidia https://helm.ngc.nvidia.com/nvidia
104+
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
102105
helm repo update
103106

107+
# Prometheus Stack (kube-prometheus-stack)
108+
# Provides Prometheus, Alertmanager, and ServiceMonitor CRDs (Grafana is disabled below)
109+
# Must be installed before dcgm-exporter so ServiceMonitor CRDs are available
110+
echo "Installing kube-prometheus-stack..."
111+
helm upgrade -i kube-prometheus-stack \
112+
oci://ghcr.io/prometheus-community/charts/kube-prometheus-stack \
113+
--namespace monitoring \
114+
--create-namespace \
115+
--set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false \
116+
--set prometheus.prometheusSpec.podMonitorSelectorNilUsesHelmValues=false \
117+
--set grafana.enabled=false \
118+
--wait
119+
104120
# NVIDIA GPU Operator
105-
# Manages the full NVIDIA stack: kernel driver, container toolkit, device plugin.
121+
# Manages the full NVIDIA stack: kernel driver, container toolkit, device plugin, DCGM exporter.
106122
# The driver is installed into /run/nvidia/driver on each node.
123+
# DCGM exporter is enabled with ServiceMonitor for Prometheus integration.
124+
echo "Installing NVIDIA GPU Operator with DCGM exporter..."
107125
helm upgrade -i nvidia-gpu-operator --wait \
108-
-n gpu-operator --create-namespace \
109-
nvidia/gpu-operator \
110-
--version=v25.10.1 \
111-
--wait
126+
-n gpu-operator --create-namespace \
127+
nvidia/gpu-operator \
128+
--version=v25.10.1 \
129+
--set dcgmExporter.enabled=true \
130+
--set dcgmExporter.serviceMonitor.enabled=true \
131+
--set dcgmExporter.serviceMonitor.additionalLabels.release=kube-prometheus-stack
132+
133+
echo "GPU Operator installation complete. Checking component status..."
134+
kubectl get pods -n gpu-operator -o wide || echo "Warning: GPU Operator pods not ready yet"
135+
136+
echo "DCGM Exporter pod status in gpu-operator namespace..."
137+
kubectl get pods -n gpu-operator -l app=nvidia-dcgm-exporter -o wide || echo "Warning: DCGM exporter pods not found"
138+
139+
echo "GPU Operator deployment details for debugging:"
140+
kubectl get all -n gpu-operator || true
112141

113142
PATH="$(pwd):$PATH"
114143
export PATH
@@ -117,7 +146,7 @@ export PATH
117146
# Uses the driver installed by GPU Operator at /run/nvidia/driver (the default).
118147
echo "Installing NVIDIA DRA Driver..."
119148

120-
cat > values.yaml <<EOF
149+
cat >values.yaml <<EOF
121150
# The driver daemonset needs a toleration for the nvidia.com/gpu taint
122151
kubeletPlugin:
123152
tolerations:
@@ -127,15 +156,14 @@ kubeletPlugin:
127156
EOF
128157

129158
helm upgrade -i nvidia-dra-driver-gpu nvidia/nvidia-dra-driver-gpu \
130-
--version="25.12.0" \
131-
--create-namespace \
132-
--namespace nvidia-dra-driver-gpu \
133-
--set resources.gpus.enabled=true \
134-
--set nvidiaDriverRoot=/run/nvidia/driver \
135-
--set gpuResourcesEnabledOverride=true \
136-
-f values.yaml \
137-
--wait
138-
159+
--version="25.12.0" \
160+
--create-namespace \
161+
--namespace nvidia-dra-driver-gpu \
162+
--set resources.gpus.enabled=true \
163+
--set nvidiaDriverRoot=/run/nvidia/driver \
164+
--set gpuResourcesEnabledOverride=true \
165+
-f values.yaml \
166+
--wait
139167

140168
# KubeRay
141169
echo "Installing KubeRay Operator..."
@@ -145,7 +173,6 @@ kubectl apply --server-side -k "github.com/ray-project/kuberay/ray-operator/conf
145173
echo "Installing Kueue..."
146174
kubectl apply --server-side -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.14.8/manifests.yaml
147175

148-
149176
echo "----------------------------------------------------------------"
150177
echo "Verifying Cluster and Components"
151178
echo "----------------------------------------------------------------"
@@ -166,6 +193,20 @@ kubectl rollout status deployment -n kuberay-system kuberay-operator --timeout=5
166193
echo "Verifying Gateway API..."
167194
kubectl get gatewayclass || echo "Warning: GatewayClass not found"
168195

196+
echo "Verifying Prometheus Stack..."
197+
kubectl rollout status deployment -n monitoring kube-prometheus-stack-operator --timeout=5m || echo "Warning: Prometheus Operator not ready yet"
198+
kubectl rollout status statefulset -n monitoring prometheus-kube-prometheus-stack-prometheus --timeout=5m || echo "Warning: Prometheus not ready yet"
199+
200+
echo "Verifying DCGM Exporter in gpu-operator namespace..."
201+
kubectl rollout status daemonset -n gpu-operator nvidia-dcgm-exporter --timeout=5m || echo "Warning: DCGM Exporter not ready yet"
202+
203+
echo "DCGM Exporter pod details for debugging:"
204+
kubectl get pods -n gpu-operator -l app=nvidia-dcgm-exporter -o wide || echo "Warning: No DCGM exporter pods found"
205+
kubectl describe pods -n gpu-operator -l app=nvidia-dcgm-exporter || true
206+
207+
echo "Verifying ServiceMonitor for DCGM in gpu-operator namespace..."
208+
kubectl get servicemonitor -n gpu-operator nvidia-dcgm-exporter -o yaml || echo "Warning: DCGM ServiceMonitor not found"
209+
169210
echo "Verifying Allocatable GPUs..."
170211
# Wait a bit for nodes to report resources
171212
sleep 30
@@ -216,6 +257,55 @@ echo "Waiting for Sample Workload to Complete..."
216257
kubectl wait --for=condition=complete job/test-gpu-pod --timeout=5m || true
217258
kubectl logs job/test-gpu-pod || echo "Failed to get logs"
218259

260+
echo "Verifying GPU Metrics in Prometheus..."
261+
# Wait for DCGM exporter to start collecting metrics
262+
# Query Prometheus for DCGM GPU metrics with retries
263+
PROM_POD=$(kubectl get pods -n monitoring -l app.kubernetes.io/name=prometheus -o jsonpath='{.items[0].metadata.name}')
264+
if [ -n "${PROM_POD}" ]; then
265+
echo "Querying Prometheus for DCGM_FI_DEV_GPU_UTIL metric (retrying up to 5 times)..."
266+
for i in {1..5}; do
267+
echo "Attempt $i..."
268+
METRICS=$(kubectl exec -n monitoring "${PROM_POD}" -c prometheus -- \
269+
wget -qO- 'http://localhost:9090/api/v1/query?query=DCGM_FI_DEV_GPU_UTIL' 2>/dev/null)
270+
if echo "${METRICS}" | grep -q "result"; then
271+
echo "Successfully retrieved GPU metrics:"
272+
echo "${METRICS}" | head -c 500
273+
break
274+
fi
275+
echo "Metrics not yet available, waiting 20s..."
276+
sleep 20
277+
done
278+
else
279+
echo "Warning: Prometheus pod not found"
280+
fi
281+
282+
echo "----------------------------------------------------------------"
283+
echo "GPU Operator Component Status for Debugging"
284+
echo "----------------------------------------------------------------"
285+
286+
echo "All GPU Operator pods:"
287+
kubectl get pods -n gpu-operator -o wide || true
288+
289+
echo ""
290+
echo "GPU Operator DaemonSets:"
291+
kubectl get daemonsets -n gpu-operator -o wide || true
292+
293+
echo ""
294+
echo "DCGM Exporter DaemonSet details:"
295+
kubectl describe daemonset -n gpu-operator nvidia-dcgm-exporter || true
296+
297+
echo ""
298+
echo "DCGM Exporter Service:"
299+
kubectl get service -n gpu-operator nvidia-dcgm-exporter -o yaml || echo "No DCGM service found"
300+
301+
echo ""
302+
echo "DCGM Exporter ServiceMonitor:"
303+
kubectl get servicemonitor -n gpu-operator nvidia-dcgm-exporter -o yaml || echo "No ServiceMonitor found"
304+
305+
echo ""
306+
echo "Recent GPU Operator events:"
307+
kubectl get events -n gpu-operator --sort-by='.lastTimestamp' | tail -20 || true
308+
219309
echo "AI Conformance Environment Setup Complete."
220310

221311
# Now run the actual AI conformance tests

0 commit comments

Comments
 (0)