
Commit b494e18

Merge pull request #18055 from justinsb/aiconformance_security

[aiconformance] Add tests for security / isolation

2 parents: 974fa89 + 62ee4be

8 files changed

Lines changed: 367 additions & 5 deletions


tests/e2e/scenarios/ai-conformance/run-test.sh

Lines changed: 1 addition & 1 deletion
@@ -73,7 +73,7 @@ spec:
   image: 099720109477/ubuntu/images/hvm-ssd-gp3/ubuntu-noble-24.04-amd64-server-20251212
   machineType: g6.xlarge
   maxSize: 3
-  minSize: 1
+  minSize: 2
   role: Node
   rootVolumeSize: 48
   subnets:
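(Likely rationale, not stated in the diff: the new accelerator-isolation check runs two single-GPU pods concurrently, and each g6.xlarge node carries a single NVIDIA L4 GPU, so the instance group needs at least two GPU nodes.)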

tests/e2e/scenarios/ai-conformance/validators/accelerators/dra_support/dra_cuda_test.go

Lines changed: 2 additions & 2 deletions
@@ -47,8 +47,8 @@ func TestDRAWorks(t *testing.T) {
     }
 
     h.Logf("## Run cuda-smoketest")
-    ns := "default"
-    h.ShellExec(fmt.Sprintf("kubectl apply --namespace %s -f testdata/cuda-smoketest.yaml", ns))
+    ns := h.TestNamespace()
+    h.ApplyManifest(ns, "testdata/cuda-smoketest.yaml")
     h.ShellExec(fmt.Sprintf("kubectl wait --for=condition=complete --namespace %s job/cuda-smoketest --timeout=5m", ns))
     h.ShellExec(fmt.Sprintf("kubectl logs --namespace %s job/cuda-smoketest", ns))
 }

tests/e2e/scenarios/ai-conformance/validators/kube.go

Lines changed: 75 additions & 2 deletions
@@ -17,7 +17,12 @@ limitations under the License.
 package validators
 
 import (
+    "bytes"
+    "context"
     "fmt"
+    "os"
+    "os/exec"
+    "path/filepath"
     "strings"
     "time"
 
@@ -164,9 +169,12 @@ func (h *ValidatorHarness) TestNamespace() string {
     defer h.mutex.Unlock()
 
     if h.testNamespace == "" {
-        prefix := strings.ToLower(h.t.Name())
+        prefix := h.t.Name()
+        prefix = prefix[strings.LastIndex(prefix, "/")+1:]
+        prefix = strings.ToLower(prefix)
         prefix = strings.ReplaceAll(prefix, "/", "-")
         prefix = strings.ReplaceAll(prefix, "_", "-")
+
         ns := fmt.Sprintf("%s-%d", prefix, time.Now().Unix())
 
         nsObj := &unstructured.Unstructured{}
@@ -182,8 +190,11 @@ func (h *ValidatorHarness) TestNamespace() string {
         h.testNamespace = ns
 
         h.t.Cleanup(func() {
+            h.dumpNamespaceResources(ns)
+
             h.Logf("Deleting test namespace %q", ns)
-            err := h.DynamicClient().Resource(namespaceGVR).Delete(h.Context(), ns, metav1.DeleteOptions{})
+            ctx := context.WithoutCancel(h.Context())
+            err := h.DynamicClient().Resource(namespaceGVR).Delete(ctx, ns, metav1.DeleteOptions{})
             if err != nil {
                 h.Logf("failed to delete test namespace: %v", err)
             }
@@ -192,3 +203,65 @@ func (h *ValidatorHarness) TestNamespace() string {
 
     return h.testNamespace
 }
+
+// ApplyManifest applies a Kubernetes manifest from the given file path to the specified namespace.
+// We use kubectl so that the output is clear and in theory someone could run the same commands themselves to debug.
+func (h *ValidatorHarness) ApplyManifest(namespace string, manifestPath string) {
+    h.Logf("Applying manifest %q to namespace %q", manifestPath, namespace)
+    h.ShellExec(fmt.Sprintf("kubectl apply -n %s -f %s", namespace, manifestPath))
+}
+
+// dumpNamespaceResources dumps key resources from the namespace to the artifacts directory for debugging.
+func (h *ValidatorHarness) dumpNamespaceResources(ns string) {
+    artifactsDir := os.Getenv("ARTIFACTS")
+    if artifactsDir == "" {
+        artifactsDir = "_artifacts"
+    }
+
+    testName := strings.ReplaceAll(h.t.Name(), "/", "_")
+    clusterInfoDir := filepath.Join(artifactsDir, "per-test", testName, "cluster-info", ns)
+    if err := os.MkdirAll(clusterInfoDir, 0o755); err != nil {
+        h.Logf("failed to create cluster-info directory: %v", err)
+        return
+    }
+
+    resourceTypes := []string{
+        "pods",
+        "jobs",
+        "deployments",
+        "statefulsets",
+        "services",
+        "events",
+    }
+
+    for _, resourceType := range resourceTypes {
+        if err := h.dumpResource(ns, resourceType, filepath.Join(clusterInfoDir, resourceType+".yaml")); err != nil {
+            h.Logf("failed to dump resource %s: %v", resourceType, err)
+        }
+    }
+}
+
+// dumpResource runs kubectl get for a resource type and writes the output to a file.
+// Errors are logged but do not fail the test.
+func (h *ValidatorHarness) dumpResource(ns string, resourceType string, outputPath string) error {
+    args := []string{"get", resourceType}
+    if ns != "" {
+        args = append(args, "-n", ns)
+    }
+    args = append(args, "-o", "yaml")
+    cmd := exec.CommandContext(context.WithoutCancel(h.Context()), "kubectl", args...)
+    var stdout bytes.Buffer
+    var stderr bytes.Buffer
+    cmd.Stdout = &stdout
+    cmd.Stderr = &stderr
+
+    if err := cmd.Run(); err != nil {
+        return fmt.Errorf("failed to dump %s in namespace %s: %v (stderr: %s)", resourceType, ns, err, stderr.String())
+    }
+
+    if err := os.WriteFile(outputPath, stdout.Bytes(), 0o644); err != nil {
+        return fmt.Errorf("failed to write %s dump to %s: %w", resourceType, outputPath, err)
+    }
+
+    return nil
+}
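Taken together, the harness now owns namespace lifecycle, manifest application, and cleanup-time artifact dumping. A minimal sketch of how a validator test composes these helpers, assuming only the harness calls added or used in this PR (the manifest path and job name are hypothetical):

package validators_test

import (
    "fmt"
    "testing"

    "k8s.io/kops/tests/e2e/scenarios/ai-conformance/validators"
)

func TestExampleValidator(t *testing.T) {
    h := validators.NewValidatorHarness(t)

    // TestNamespace lazily creates a per-test namespace; its cleanup hook now
    // dumps pods, jobs, events, etc. to $ARTIFACTS before deleting the namespace,
    // using context.WithoutCancel so deletion still runs after the test context ends.
    ns := h.TestNamespace()

    // ApplyManifest shells out to kubectl, so the same commands can be re-run by hand.
    h.ApplyManifest(ns, "testdata/example.yaml") // hypothetical manifest
    h.ShellExec(fmt.Sprintf("kubectl wait --for=condition=complete -n %s job/example --timeout=5m", ns))
}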

tests/e2e/scenarios/ai-conformance/validators/output.go

Lines changed: 24 additions & 0 deletions
@@ -19,6 +19,7 @@ package validators
 import (
     "fmt"
     "io"
+    "testing"
 )
 
 // OutputSink is an interface for writing output from the validators.
@@ -61,6 +62,29 @@ func (h *ValidatorHarness) Fatalf(format string, args ...interface{}) {
     h.t.Fatalf(format, args...)
 }
 
+// Errorf is like t.Errorf, but also writes to the sinks.
+func (h *ValidatorHarness) Errorf(format string, args ...interface{}) {
+    s := fmt.Sprintf(format, args...)
+
+    h.output.WriteText("ERROR: " + s)
+    h.t.Errorf(format, args...)
+}
+
+// Run is like t.Run, but creates a sub-harness that shares the output.
+func (h *ValidatorHarness) Run(name string, testFunc func(h *ValidatorHarness)) {
+    h.t.Run(name, func(t *testing.T) {
+        subHarness := &ValidatorHarness{
+            t: t,
+
+            // Share most of the state with the parent harness, but use the sub-test's *testing.T and a new context.
+            output:        h.output,
+            dynamicClient: h.dynamicClient,
+            restConfig:    h.restConfig,
+        }
+        testFunc(subHarness)
+    })
+}
+
 // Success is like Logf, but indicates a successful check.
 func (h *ValidatorHarness) Success(format string, args ...interface{}) {
     s := fmt.Sprintf(format, args...)
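Because Run builds a fresh sub-harness instead of mutating the parent, per-test state such as the lazily-created namespace starts out empty: each sub-test that calls TestNamespace gets its own namespace, while log output still flows to the shared sinks. Extending the sketch after the kube.go diff above (sub-test name and manifest path are hypothetical):

h.Run("check-a", func(h *validators.ValidatorHarness) {
    // The sub-harness has its own *testing.T and empty per-test state,
    // so this namespace (and its cleanup/dump) is unique to "check-a".
    ns := h.TestNamespace()
    h.ApplyManifest(ns, "testdata/check-a.yaml")
})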
Lines changed: 113 additions & 0 deletions
@@ -0,0 +1,113 @@
+/*
+Copyright The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package secure_accelerator_access
+
+import (
+    "fmt"
+    "strings"
+    "testing"
+
+    "k8s.io/kops/tests/e2e/scenarios/ai-conformance/validators"
+)
+
+// TestSecurity_SecureAcceleratorAccess corresponds to the security/secure_accelerator_access conformance requirement
+func TestSecurity_SecureAcceleratorAccess(t *testing.T) {
+    // Description:
+    // Ensure that access to accelerators from within containers is properly isolated and mediated by the Kubernetes resource management framework (device plugin or DRA) and container runtime, preventing unauthorized access or interference between workloads.
+    h := validators.NewValidatorHarness(t)
+
+    h.Logf("# Secure Accelerator Access")
+
+    score := 0
+
+    h.Logf("## Checking that GPUs are available if requested")
+
+    h.Run("accelerator-requested", func(h *validators.ValidatorHarness) {
+        ns := h.TestNamespace()
+
+        h.ApplyManifest(ns, "testdata/accelerator-requested.yaml")
+        h.ShellExec(fmt.Sprintf("kubectl wait -n %s --for=condition=complete job/accelerator-requested --timeout=60s", ns))
+
+        logs := h.ShellExec(fmt.Sprintf("kubectl logs -n %s job/accelerator-requested", ns))
+        if !strings.Contains(logs.Stdout(), "<product_brand>NVIDIA</product_brand>") {
+            h.Errorf("Expected to find nvidia GPUs available when requested, but did not find them in the logs: %s", logs.Stdout())
+        } else {
+            h.Success("GPUs were requested, and nvidia-smi reported available GPUs.")
+            score++
+        }
+    })
+
+    h.Logf("## Checking that GPUs are not available if not requested")
+    h.Run("accelerator-not-requested", func(h *validators.ValidatorHarness) {
+        ns := h.TestNamespace()
+
+        h.ApplyManifest(ns, "testdata/accelerator-not-requested.yaml")
+        h.ShellExec(fmt.Sprintf("kubectl wait -n %s --for=condition=complete job/accelerator-not-requested --timeout=60s", ns))
+
+        logs := h.ShellExec(fmt.Sprintf("kubectl logs -n %s job/accelerator-not-requested", ns))
+        if !strings.Contains(logs.Stdout(), "nvidia-smi failed (as expected)") {
+            h.Errorf("Expected nvidia-smi to fail when GPUs are not requested, but found them in the logs: %s", logs.Stdout())
+        } else {
+            h.Success("No GPUs were requested, and nvidia-smi did not report any GPUs.")
+            score++
+        }
+    })
+
+    h.Logf("## Pods with GPU requests should be isolated from each other")
+    h.Run("accelerator-isolation", func(h *validators.ValidatorHarness) {
+        ns := h.TestNamespace()
+
+        h.ApplyManifest(ns, "testdata/accelerator-isolation.yaml")
+        h.ShellExec(fmt.Sprintf("kubectl wait -n %s --for=condition=available deployment/accelerator-isolation-1 --timeout=60s", ns))
+        h.ShellExec(fmt.Sprintf("kubectl wait -n %s --for=condition=available deployment/accelerator-isolation-2 --timeout=60s", ns))
+
+        logs1 := h.ShellExec(fmt.Sprintf("kubectl logs -n %s deployment/accelerator-isolation-1", ns))
+        logs2 := h.ShellExec(fmt.Sprintf("kubectl logs -n %s deployment/accelerator-isolation-2", ns))
+
+        uuid1 := extractGPUUUID(logs1.Stdout())
+        uuid2 := extractGPUUUID(logs2.Stdout())
+
+        if uuid1 == "" {
+            h.Errorf("Failed to extract GPU UUID from logs of accelerator-isolation-1:\n%s", logs1.Stdout())
+        } else if uuid2 == "" {
+            h.Errorf("Failed to extract GPU UUID from logs of accelerator-isolation-2:\n%s", logs2.Stdout())
+        } else if uuid1 == uuid2 {
+            h.Errorf("Expected that pods with GPU requests would be isolated from each other, but both pods saw the same GPU UUID: %s", uuid1)
+        } else {
+            h.Success("Pods with GPU requests were isolated from each other as expected.")
+            score++
+        }
+    })
+
+    if score == 3 {
+        h.RecordConformance("security/secure_accelerator_access")
+    }
+}
+
+// extractGPUUUID is a helper function to extract the GPU UUID from nvidia-smi XML output in the logs.
+func extractGPUUUID(logs string) string {
+    lines := strings.Split(logs, "\n")
+    for _, line := range lines {
+        line = strings.TrimSpace(line)
+        if strings.HasPrefix(line, "<uuid>") && strings.HasSuffix(line, "</uuid>") {
+            value := strings.TrimPrefix(line, "<uuid>")
+            value = strings.TrimSuffix(value, "</uuid>")
+            return value
+        }
+    }
+    return ""
+}
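extractGPUUUID scans the logs line by line rather than parsing XML, so it relies on nvidia-smi printing <uuid>...</uuid> on a line of its own in -q --xml-format output (and it returns only the first UUID it finds). A minimal unit test sketching the expected shape, which could sit alongside the helper in the same package (the XML fragment is illustrative and the UUID value is made up):

func TestExtractGPUUUID(t *testing.T) {
    logs := `
<gpu id="00000000:38:00.0">
    <product_name>NVIDIA L4</product_name>
    <uuid>GPU-0aaaaaaa-1111-2222-3333-444444444444</uuid>
</gpu>`
    if got, want := extractGPUUUID(logs), "GPU-0aaaaaaa-1111-2222-3333-444444444444"; got != want {
        t.Errorf("extractGPUUUID() = %q, want %q", got, want)
    }
}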
Lines changed: 81 additions & 0 deletions
@@ -0,0 +1,81 @@
+apiVersion: resource.k8s.io/v1
+kind: ResourceClaimTemplate
+metadata:
+  name: single-gpu
+spec:
+  spec:
+    devices:
+      requests:
+      - name: single-gpu
+        exactly:
+          deviceClassName: gpu.nvidia.com
+          allocationMode: ExactCount
+          count: 1
+
+---
+
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: accelerator-isolation-1
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: accelerator-isolation-1
+  template:
+    metadata:
+      labels:
+        app: accelerator-isolation-1
+    spec:
+      containers:
+      - name: default
+        image: ubuntu:22.04
+        command: ["bash", "-c"]
+        args: ["while [ 1 ]; do date; nvidia-smi -q --xml-format; sleep 60; done"]
+        resources:
+          claims:
+          - name: single-gpu
+      # We do request a GPU, so we should be able to access it.
+      resourceClaims:
+      - name: single-gpu
+        resourceClaimTemplateName: single-gpu
+      # We need to tolerate the GPU taint to be scheduled onto a GPU node, even if we are requesting the GPU via a ResourceClaim.
+      tolerations:
+      - key: "nvidia.com/gpu"
+        operator: "Exists"
+        effect: "NoSchedule"
+
+---
+
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: accelerator-isolation-2
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: accelerator-isolation-2
+  template:
+    metadata:
+      labels:
+        app: accelerator-isolation-2
+    spec:
+      containers:
+      - name: default
+        image: ubuntu:22.04
+        command: ["bash", "-c"]
+        args: ["while [ 1 ]; do date; nvidia-smi -q --xml-format; sleep 60; done"]
+        resources:
+          claims:
+          - name: single-gpu
+      # We do request a GPU, so we should be able to access it.
+      resourceClaims:
+      - name: single-gpu
+        resourceClaimTemplateName: single-gpu
+      # We need to tolerate the GPU taint to be scheduled onto a GPU node, even if we are requesting the GPU via a ResourceClaim.
+      tolerations:
+      - key: "nvidia.com/gpu"
+        operator: "Exists"
+        effect: "NoSchedule"
Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: accelerator-not-requested
+spec:
+  template:
+    metadata:
+      labels:
+        app: dra-gpu-example
+    spec:
+      restartPolicy: Never
+      # We want to be scheduled onto a node with GPUs
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+            - matchExpressions:
+              - key: nvidia.com/gpu.present
+                operator: In
+                values:
+                - "true"
+      tolerations:
+      - key: "nvidia.com/gpu"
+        operator: "Exists"
+        effect: "NoSchedule"
+      containers:
+      - name: default
+        image: ubuntu:22.04
+        command: ["bash", "-c"]
+        args: ['nvidia-smi -q --xml-format || echo "nvidia-smi failed (as expected)"']
+      # We do _not_ request a GPU, so we should not be able to access it.
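Note the "|| echo" fallback in the final args line: it makes the container exit 0 even though nvidia-smi fails on a GPU-less allocation, so the Job still reaches condition=complete and the test asserts on the "nvidia-smi failed (as expected)" marker in the logs rather than on a Job failure.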
