Skip to content

Commit b974d5e

Browse files
committed
tests/ai-conformance: create simple smoketest for CUDA via DRA
1 parent dcb96ed commit b974d5e

3 files changed

Lines changed: 97 additions & 3 deletions

File tree

tests/e2e/scenarios/ai-conformance/validators/accelerators/dra_support/dra_cuda_test.go

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,8 @@ package dra_support
1818

1919
import (
2020
"fmt"
21-
"strings"
2221
"testing"
2322

24-
"k8s.io/apimachinery/pkg/runtime/schema"
2523
"k8s.io/kops/tests/e2e/scenarios/ai-conformance/validators"
2624
)
2725

@@ -33,6 +31,22 @@ func TestDRAWorks(t *testing.T) {
3331
h.Logf("## Listing device classes")
3432
deviceClasses := h.ListDeviceClasses()
3533
for _, deviceClass := range deviceClasses {
36-
h.Logf("* %s", deviceClass.Name)
34+
h.Logf("* %s", deviceClass.Name())
3735
}
36+
37+
h.Logf("## Listing resource slices")
38+
resourceSlices := h.ListResourceSlices()
39+
for _, resourceSlice := range resourceSlices {
40+
h.Logf("* %s", resourceSlice.Name())
41+
}
42+
43+
if !h.HasDeviceClass("gpu.nvidia.com") {
44+
t.Skipf("gpu.nvidia.com device class not found; skipping")
45+
}
46+
47+
h.Logf("## Run cuda-smoketest")
48+
ns := "default"
49+
h.ShellExec(fmt.Sprintf("kubectl apply --namespace %s -f testdata/cuda-smoketest.yaml", ns))
50+
h.ShellExec(fmt.Sprintf("kubectl wait --for=condition=complete --namespace %s job/cuda-smoketest --timeout=5m", ns))
51+
h.ShellExec(fmt.Sprintf("kubectl logs --namespace %s job/cuda-smoketest", ns))
3852
}
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
# Create a ResourceClaim and Job to test DRA
#
# The ResourceClaim asks for exactly one device from the gpu.nvidia.com
# device class; the Job in the next document consumes it by name.
apiVersion: resource.k8s.io/v1
kind: ResourceClaim
metadata:
  name: cuda-smoketest
spec:
  devices:
    requests:
      # A single request named "single-gpu" for one device.
      - name: single-gpu
        exactly:
          deviceClassName: gpu.nvidia.com
          allocationMode: ExactCount
          count: 1
14+
15+
---
16+
17+
# Job that runs the vectorAdd CUDA sample against the device allocated
# through the "cuda-smoketest" ResourceClaim.
apiVersion: batch/v1
kind: Job
metadata:
  name: cuda-smoketest
spec:
  template:
    spec:
      restartPolicy: Never
      # Tolerate the nvidia.com/gpu NoSchedule taint.
      tolerations:
        - key: "nvidia.com/gpu"
          operator: "Exists"
          effect: "NoSchedule"
      containers:
        - name: test
          image: nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0-ubuntu22.04
          command:
            - "/bin/sh"
            - "-c"
          args:
            - "/cuda-samples/vectorAdd"
          resources:
            # Refers to the pod-level resourceClaims entry named "gpu" below.
            claims:
              - name: gpu
      # Pod-level claim entry: binds the name "gpu" to the ResourceClaim
      # object called "cuda-smoketest".
      resourceClaims:
        - name: gpu
          resourceClaimName: cuda-smoketest

tests/e2e/scenarios/ai-conformance/validators/kube.go

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2525
)
2626

27+
// DynamicClient returns a dynamic client for the kubernetes cluster.
2728
func (h *ValidatorHarness) DynamicClient() dynamic.Interface {
2829
if h.dynamicClient == nil {
2930
dynamicClient, err := dynamic.NewForConfig(h.restConfig)
@@ -35,6 +36,7 @@ func (h *ValidatorHarness) DynamicClient() dynamic.Interface {
3536
return h.dynamicClient
3637
}
3738

39+
// DeviceClass is a wrapper around the DRA DeviceClass type.
3840
type DeviceClass struct {
3941
u *unstructured.Unstructured
4042
}
@@ -49,6 +51,7 @@ var deviceClassGVR = schema.GroupVersionResource{
4951
Resource: "deviceclasses",
5052
}
5153

54+
// ListDeviceClasses lists all device classes in the cluster.
5255
func (h *ValidatorHarness) ListDeviceClasses() []*DeviceClass {
5356
objectList, err := h.DynamicClient().Resource(deviceClassGVR).List(h.Context(), metav1.ListOptions{})
5457
if err != nil {
@@ -60,3 +63,41 @@ func (h *ValidatorHarness) ListDeviceClasses() []*DeviceClass {
6063
}
6164
return out
6265
}
66+
67+
// HasDeviceClass returns true if a device class with the given name exists.
68+
func (h *ValidatorHarness) HasDeviceClass(name string) bool {
69+
for _, deviceClass := range h.ListDeviceClasses() {
70+
if deviceClass.Name() == name {
71+
return true
72+
}
73+
}
74+
return false
75+
}
76+
77+
// ResourceSlice is a wrapper around the DRA ResourceSlice type.
78+
type ResourceSlice struct {
79+
u *unstructured.Unstructured
80+
}
81+
82+
func (d *ResourceSlice) Name() string {
83+
return d.u.GetName()
84+
}
85+
86+
var resourceSliceGVR = schema.GroupVersionResource{
87+
Group: "resource.k8s.io",
88+
Version: "v1",
89+
Resource: "resourceslices",
90+
}
91+
92+
// ListResourceSlices lists all resource slices in the cluster.
93+
func (h *ValidatorHarness) ListResourceSlices() []*ResourceSlice {
94+
objectList, err := h.DynamicClient().Resource(resourceSliceGVR).List(h.Context(), metav1.ListOptions{})
95+
if err != nil {
96+
h.Fatalf("failed to list resource slices: %v", err)
97+
}
98+
var out []*ResourceSlice
99+
for i := range objectList.Items {
100+
out = append(out, &ResourceSlice{u: &objectList.Items[i]})
101+
}
102+
return out
103+
}

0 commit comments

Comments
 (0)