Skip to content

Commit ac99f83

Browse files
authored
Merge pull request #18064 from justinsb/aiconformance_accelerator_metrics
[aiconformance]: Add accelerator metrics validator test
2 parents 9a8c64f + f9d270e commit ac99f83

File tree

1 file changed

+131
-0
lines changed

1 file changed

+131
-0
lines changed
Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
/*
2+
Copyright The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package ai_inference
18+
19+
import (
20+
"fmt"
21+
"strings"
22+
"testing"
23+
"time"
24+
25+
"k8s.io/kops/tests/e2e/scenarios/ai-conformance/validators"
26+
)
27+
28+
// TestObservability_AcceleratorMetrics corresponds to the observability/accelerator_metrics conformance requirement.
29+
func TestObservability_AcceleratorMetrics(t *testing.T) {
30+
// Description:
31+
// For supported accelerator types, the platform must allow for the installation and successful operation of at least one accelerator metrics solution
32+
// that exposes fine-grained performance metrics via a standardized, machine-readable metrics endpoint.
33+
// This must include a core set of metrics for per-accelerator utilization and memory usage.
34+
// Additionally, other relevant metrics such as temperature, power draw, and interconnect bandwidth should be exposed
35+
// if the underlying hardware or virtualization layer makes them available.
36+
// The list of metrics should align with emerging standards, such as OpenTelemetry metrics, to ensure interoperability.
37+
// The platform may provide a managed solution, but this is not required for conformance."
38+
39+
h := validators.NewValidatorHarness(t)
40+
41+
h.Logf("# Observability: Accelerator Metrics")
42+
43+
h.Run("nvidia-metrics", func(h *validators.ValidatorHarness) {
44+
h.Logf("## Verify NVIDIA Metrics")
45+
46+
h.ShellExec("kubectl get service -n gpu-operator")
47+
48+
ns := h.TestNamespace()
49+
50+
requiredMetrics := []string{
51+
"DCGM_FI_DEV_GPU_TEMP",
52+
"DCGM_FI_DEV_POWER_USAGE",
53+
"DCGM_FI_DEV_GPU_UTIL",
54+
"DCGM_FI_DEV_FB_USED",
55+
}
56+
57+
var metricClasses map[string]bool
58+
var metricsOutput string
59+
60+
// Retry scraping metrics, as the DCGM exporter may not have completed its first collection cycle yet.
61+
const maxAttempts = 5
62+
for attempt := 1; attempt <= maxAttempts; attempt++ {
63+
podName := fmt.Sprintf("scrape-accelerator-metrics-%d", attempt)
64+
65+
h.ShellExec(fmt.Sprintf(
66+
"kubectl run %s -n %s --image=registry.k8s.io/e2e-test-images/agnhost:2.39 --restart=Never --command -- curl -sS http://nvidia-dcgm-exporter.gpu-operator.svc.cluster.local:9400/metrics",
67+
podName, ns,
68+
))
69+
//h.ShellExec(fmt.Sprintf("kubectl wait -n %s pod/%s --for=condition=Ready --timeout=60s", ns, jobName))
70+
h.ShellExec(fmt.Sprintf("kubectl wait -n %s pod/%s --for=jsonpath='{.status.phase}'=Succeeded --timeout=120s", ns, podName))
71+
72+
logs := h.ShellExec(fmt.Sprintf("kubectl logs -n %s %s", ns, podName))
73+
metricsOutput = logs.Stdout()
74+
75+
metricClasses = make(map[string]bool)
76+
for _, line := range strings.Split(metricsOutput, "\n") {
77+
line = strings.TrimSpace(line)
78+
// Ignore comment lines
79+
if strings.HasPrefix(line, "#") {
80+
continue
81+
}
82+
fields := strings.Fields(line)
83+
// Ignore lines that don't have at least a metric name and value
84+
if len(fields) < 2 {
85+
continue
86+
}
87+
metric := fields[0]
88+
89+
// Extract out the metric class, ignoring any labels. For example, from "DCGM_FI_DEV_GPU_TEMP{gpu=\"0\"}" we want "DCGM_FI_DEV_GPU_TEMP".
90+
metricClass := metric
91+
if prefix, _, ok := strings.Cut(metric, "{"); ok {
92+
metricClass = prefix
93+
}
94+
95+
// Record the metric class as found
96+
metricClasses[metricClass] = true
97+
}
98+
99+
allFound := true
100+
for _, m := range requiredMetrics {
101+
if !metricClasses[m] {
102+
allFound = false
103+
break
104+
}
105+
}
106+
if allFound {
107+
h.Logf("All required metrics found on attempt %d", attempt)
108+
break
109+
}
110+
111+
if attempt < maxAttempts {
112+
h.Logf("Attempt %d: not all required metrics found, retrying in 10s...", attempt)
113+
time.Sleep(10 * time.Second)
114+
}
115+
}
116+
117+
h.Logf("Received metrics:\n%s", metricsOutput)
118+
119+
for _, m := range requiredMetrics {
120+
if !metricClasses[m] {
121+
h.Errorf("Did not find expected metric: %s", m)
122+
} else {
123+
h.Logf("Found expected metric: %s", m)
124+
}
125+
}
126+
})
127+
128+
if h.AllPassed() {
129+
h.RecordConformance("observability/accelerator_metrics")
130+
}
131+
}

0 commit comments

Comments
 (0)