Skip to content

Commit 93887f9

Browse files
authored
Merge pull request #18075 from justinsb/aiconformance_schedulingOrchestration_pod_autoscaling
[aiconformance]: test for schedulingOrchestration pod autoscaling
2 parents 80196b9 + a02cacb commit 93887f9

10 files changed

Lines changed: 536 additions & 19 deletions

File tree

tests/e2e/scenarios/ai-conformance/run-test.sh

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,36 @@ helm upgrade -i kube-prometheus-stack \
134134
--set grafana.enabled=false \
135135
--wait
136136

137+
# Prometheus Adapter
138+
# Bridges Prometheus metrics to the Kubernetes custom metrics API (custom.metrics.k8s.io),
139+
# required for HPA to scale on custom metrics like vllm:num_requests_waiting.
140+
# Note: colons in metric names are incompatible with the Kubernetes custom metrics
141+
# API (they appear in URL paths), so the adapter renames the metric with underscores.
142+
echo "Installing prometheus-adapter..."
143+
cat >prometheus-adapter-values.yaml <<'ADAPTER_EOF'
144+
prometheus:
145+
url: http://kube-prometheus-stack-prometheus.monitoring.svc
146+
port: 9090
147+
rules:
148+
custom:
149+
- seriesQuery: '{__name__=~"^vllm:num_requests_waiting$"}'
150+
resources:
151+
overrides:
152+
namespace:
153+
resource: "namespace"
154+
pod:
155+
resource: "pod"
156+
name:
157+
matches: ""
158+
as: "vllm_num_requests_waiting"
159+
metricsQuery: sum by(namespace, pod) (<<.Series>>)
160+
ADAPTER_EOF
161+
helm upgrade -i prometheus-adapter prometheus-community/prometheus-adapter \
162+
--namespace monitoring \
163+
-f prometheus-adapter-values.yaml \
164+
--wait
165+
rm -f prometheus-adapter-values.yaml
166+
137167
# NVIDIA GPU Operator
138168
# Manages the full NVIDIA stack: kernel driver, container toolkit, device plugin, DCGM exporter.
139169
# The driver is installed into /run/nvidia/driver on each node.
@@ -239,6 +269,11 @@ echo "Verifying Prometheus Stack..."
239269
kubectl rollout status deployment -n monitoring kube-prometheus-stack-operator --timeout=5m || echo "Warning: Prometheus Operator not ready yet"
240270
kubectl rollout status statefulset -n monitoring prometheus-kube-prometheus-stack-prometheus --timeout=5m || echo "Warning: Prometheus not ready yet"
241271

272+
echo "Verifying Prometheus Adapter..."
273+
kubectl rollout status deployment -n monitoring prometheus-adapter --timeout=5m || echo "Warning: Prometheus Adapter not ready yet"
274+
echo "Checking custom metrics API registration..."
275+
kubectl get apiservice v1beta1.custom.metrics.k8s.io || echo "Warning: custom metrics API not registered"
276+
242277
echo "Verifying DCGM Exporter in gpu-operator namespace..."
243278
kubectl rollout status daemonset -n gpu-operator nvidia-dcgm-exporter --timeout=5m || echo "Warning: DCGM Exporter not ready yet"
244279

@@ -352,7 +387,7 @@ echo "AI Conformance Environment Setup Complete."
352387

353388
# Now run the actual AI conformance tests
354389
cd "${REPO_ROOT}/tests/e2e/scenarios/ai-conformance/validators"
355-
go test -v ./... -timeout=60m
390+
go test -v -p 1 ./... -timeout=60m
356391

357392
# Compile and write the conformance report
358393
cd "${REPO_ROOT}/tests/e2e/scenarios/ai-conformance"

tests/e2e/scenarios/ai-conformance/validators/kube.go

Lines changed: 57 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,10 @@ import (
2727
"strings"
2828
"time"
2929

30+
apierrors "k8s.io/apimachinery/pkg/api/errors"
3031
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
3132
"k8s.io/apimachinery/pkg/runtime/schema"
33+
"k8s.io/apimachinery/pkg/util/wait"
3234
"k8s.io/client-go/dynamic"
3335
"k8s.io/kops/tests/e2e/scenarios/ai-conformance/testartifacts"
3436

@@ -193,12 +195,29 @@ func (h *ValidatorHarness) TestNamespace() string {
193195

194196
h.t.Cleanup(func() {
195197
ctx := context.WithoutCancel(h.Context())
196-
h.dumpNamespaceResources(ctx, ns)
198+
h.dumpNamespaceResources(ctx, ns, "cluster-info")
199+
200+
startTime := time.Now()
197201

198202
h.Logf("Deleting test namespace %q", ns)
199-
err := h.DynamicClient().Resource(namespaceGVR).Delete(ctx, ns, metav1.DeleteOptions{})
200-
if err != nil {
201-
h.Logf("failed to delete test namespace: %v", err)
203+
if err := h.DynamicClient().Resource(namespaceGVR).Delete(ctx, ns, metav1.DeleteOptions{}); err != nil {
204+
h.Errorf("failed to delete test namespace: %v", err)
205+
}
206+
207+
// Wait for namespace deletion to complete so that we don't have leftover namespaces consuming resources.
208+
if err := wait.PollUntilContextTimeout(ctx, 2*time.Second, 5*time.Minute, false, func(ctx context.Context) (done bool, err error) {
209+
if _, err := h.DynamicClient().Resource(namespaceGVR).Get(ctx, ns, metav1.GetOptions{}); err != nil {
210+
if apierrors.IsNotFound(err) {
211+
return true, nil
212+
}
213+
return false, fmt.Errorf("error checking for namespace deletion: %w", err)
214+
}
215+
return false, nil
216+
}); err != nil {
217+
h.Errorf("error waiting for namespace deletion: %v", err)
218+
h.dumpNamespaceResources(ctx, ns, "namespace-deletion-failure-info")
219+
} else {
220+
h.Logf("Namespace deletion took %s", time.Since(startTime).Round(time.Second))
202221
}
203222
})
204223
}
@@ -225,8 +244,8 @@ func (h *ValidatorHarness) ApplyManifest(defaultNamespace string, manifestPath s
225244
}
226245

227246
// dumpNamespaceResources dumps key resources from the namespace to the artifacts directory for debugging.
228-
func (h *ValidatorHarness) dumpNamespaceResources(ctx context.Context, ns string) {
229-
clusterInfoDir := testartifacts.PathForTestArtifact(h.t, "cluster-info")
247+
func (h *ValidatorHarness) dumpNamespaceResources(ctx context.Context, ns string, outputDir string) {
248+
clusterInfoDir := testartifacts.PathForTestArtifact(h.t, outputDir)
230249
clusterInfoDir = filepath.Join(clusterInfoDir, ns)
231250

232251
if err := os.MkdirAll(clusterInfoDir, 0o755); err != nil {
@@ -244,6 +263,7 @@ func (h *ValidatorHarness) dumpNamespaceResources(ctx context.Context, ns string
244263
// Always include Events, Pods: they are usually not in the manifest, but are often critical for understanding failures.
245264
resourceTypes["Events"] = true
246265
resourceTypes["Pods"] = true
266+
resourceTypes["Nodes"] = true
247267

248268
for resourceType := range resourceTypes {
249269
filename := strings.ToLower(resourceType) + ".yaml"
@@ -252,13 +272,20 @@ func (h *ValidatorHarness) dumpNamespaceResources(ctx context.Context, ns string
252272
}
253273
}
254274

275+
describeResourcesTypes := []string{"nodes", "pods"}
276+
for _, describeResourcesType := range describeResourcesTypes {
277+
filename := strings.ToLower(describeResourcesType) + ".txt"
278+
if err := h.kubectlDescribeResource(ctx, ns, describeResourcesType, filepath.Join(clusterInfoDir, filename)); err != nil {
279+
h.Logf("failed to kubectl describe resource %s: %v", describeResourcesType, err)
280+
}
281+
}
282+
255283
if err := h.dumpPodLogs(ctx, ns, clusterInfoDir); err != nil {
256284
h.Logf("failed to dump pod logs: %v", err)
257285
}
258286
}
259287

260288
// dumpResource runs kubectl get for a resource type and writes the output to a file.
261-
// Errors are logged but do not fail the test.
262289
func (h *ValidatorHarness) dumpResource(ctx context.Context, ns string, resourceType string, outputPath string) error {
263290
args := []string{"get", resourceType}
264291
if ns != "" {
@@ -282,6 +309,29 @@ func (h *ValidatorHarness) dumpResource(ctx context.Context, ns string, resource
282309
return nil
283310
}
284311

312+
// kubectlDescribeResource runs kubectl describe for a resource type and writes the output to a file.
313+
func (h *ValidatorHarness) kubectlDescribeResource(ctx context.Context, ns string, resourceType string, outputPath string) error {
314+
args := []string{"describe", resourceType}
315+
if ns != "" {
316+
args = append(args, "-n", ns)
317+
}
318+
cmd := exec.CommandContext(ctx, "kubectl", args...)
319+
var stdout bytes.Buffer
320+
var stderr bytes.Buffer
321+
cmd.Stdout = &stdout
322+
cmd.Stderr = &stderr
323+
324+
if err := cmd.Run(); err != nil {
325+
return fmt.Errorf("failed to kubectl describe %s in namespace %s: %v (stderr: %s)", resourceType, ns, err, stderr.String())
326+
}
327+
328+
if err := os.WriteFile(outputPath, stdout.Bytes(), 0o644); err != nil {
329+
return fmt.Errorf("failed to write %s describe output to %s: %w", resourceType, outputPath, err)
330+
}
331+
332+
return nil
333+
}
334+
285335
// dumpPodLogs collects logs from all pods in the namespace and writes them to individual files.
286336
func (h *ValidatorHarness) dumpPodLogs(ctx context.Context, ns string, clusterInfoDir string) error {
287337
podLogsDir := filepath.Join(clusterInfoDir, "pod-logs")

tests/e2e/scenarios/ai-conformance/validators/markdown.go

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -110,10 +110,19 @@ func (o *MarkdownOutput) BeforeShellExec(command string) {
110110

111111
// AfterShellExec writes the result of the executed shell command to the markdown file in a formatted code block.
112112
func (o *MarkdownOutput) AfterShellExec(command string, results *CommandResult) {
113-
o.printf("```bash")
114-
o.printf("%s", results.Stdout())
115-
o.printf("%s", results.Stderr())
116113
o.printf("```\n")
114+
stdout := strings.TrimSpace(results.Stdout())
115+
stderr := strings.TrimSpace(results.Stderr())
116+
if stdout != "" {
117+
o.printf("%s", stdout)
118+
}
119+
if stderr != "" {
120+
if stdout != "" {
121+
o.printf("\n")
122+
}
123+
o.printf("%s", stderr)
124+
}
125+
o.printf("\n```\n")
117126

118127
if results.Err() != nil {
119128
o.printf("Error:\n```\n%v\n```\n", results.Err())

tests/e2e/scenarios/ai-conformance/validators/operator/robust_controller/testdata/rayjob-sample.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ spec:
4747
#pod template
4848
template:
4949
spec:
50+
terminationGracePeriodSeconds: 5
5051
containers:
5152
- name: ray-head
5253
image: rayproject/ray:2.46.0

tests/e2e/scenarios/ai-conformance/validators/scheduling-orchestration/gangscheduling_test.go renamed to tests/e2e/scenarios/ai-conformance/validators/scheduling-orchestration/gangscheduling/gangscheduling_test.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
1414
limitations under the License.
1515
*/
1616

17-
package dra_support
17+
package gangscheduling
1818

1919
import (
2020
"fmt"
@@ -23,9 +23,9 @@ import (
2323
"k8s.io/kops/tests/e2e/scenarios/ai-conformance/validators"
2424
)
2525

26-
// TestGangScheduling_ViaKueue corresponds to the schedulingOrchestration/gang_scheduling scenario,
26+
// TestSchedulingOrchestration_GangScheduling_ViaKueue corresponds to the schedulingOrchestration/gang_scheduling scenario,
2727
// for the case that the vendor chooses to demonstrate gang scheduling support via Kueue.
28-
func TestGangScheduling_ViaKueue(t *testing.T) {
28+
func TestSchedulingOrchestration_GangScheduling_ViaKueue(t *testing.T) {
2929
// Description:
3030
// The platform must allow for the installation and successful operation of at least one gang scheduling solution that ensures all-or-nothing scheduling for distributed AI workloads (e.g. Kueue, Volcano, etc.) To be conformant, the vendor must demonstrate that their platform can successfully run at least one such solution.
3131

@@ -44,7 +44,7 @@ func TestGangScheduling_ViaKueue(t *testing.T) {
4444

4545
h.Logf("Creating a Kueue Job that requires gang scheduling")
4646
ns := h.TestNamespace()
47-
h.ApplyManifest(ns, "testdata/gangscheduling/gangscheduling-kueue.yaml")
47+
h.ApplyManifest(ns, "testdata/gangscheduling-kueue.yaml")
4848

4949
h.Logf("Waiting for Job to complete")
5050
h.ShellExec(fmt.Sprintf("kubectl wait --namespace %s --for=condition=complete job/%s --timeout=300s", ns, jobName))

tests/e2e/scenarios/ai-conformance/validators/scheduling-orchestration/testdata/gangscheduling/gangscheduling-kueue.yaml renamed to tests/e2e/scenarios/ai-conformance/validators/scheduling-orchestration/gangscheduling/testdata/gangscheduling-kueue.yaml

File renamed without changes.

0 commit comments

Comments (0)