
Commit b494e18

Merge pull request #18055 from justinsb/aiconformance_security

[aiconformance] Add tests for security / isolation

2 parents: 974fa89 + 62ee4be

8 files changed

Lines changed: 367 additions & 5 deletions


tests/e2e/scenarios/ai-conformance/run-test.sh

Lines changed: 1 addition & 1 deletion
@@ -73,7 +73,7 @@ spec:
   image: 099720109477/ubuntu/images/hvm-ssd-gp3/ubuntu-noble-24.04-amd64-server-20251212
   machineType: g6.xlarge
   maxSize: 3
-  minSize: 1
+  minSize: 2
   role: Node
   rootVolumeSize: 48
   subnets:
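(Likely rationale, not stated in the diff: the new accelerator-isolation check runs two single-GPU pods concurrently, and each g6.xlarge node carries a single NVIDIA L4 GPU, so the instance group needs at least two GPU nodes.)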

tests/e2e/scenarios/ai-conformance/validators/accelerators/dra_support/dra_cuda_test.go

Lines changed: 2 additions & 2 deletions
@@ -47,8 +47,8 @@ func TestDRAWorks(t *testing.T) {
     }
 
     h.Logf("## Run cuda-smoketest")
-    ns := "default"
-    h.ShellExec(fmt.Sprintf("kubectl apply --namespace %s -f testdata/cuda-smoketest.yaml", ns))
+    ns := h.TestNamespace()
+    h.ApplyManifest(ns, "testdata/cuda-smoketest.yaml")
     h.ShellExec(fmt.Sprintf("kubectl wait --for=condition=complete --namespace %s job/cuda-smoketest --timeout=5m", ns))
     h.ShellExec(fmt.Sprintf("kubectl logs --namespace %s job/cuda-smoketest", ns))
 }

tests/e2e/scenarios/ai-conformance/validators/kube.go

Lines changed: 75 additions & 2 deletions
@@ -17,7 +17,12 @@ limitations under the License.
 package validators
 
 import (
+    "bytes"
+    "context"
     "fmt"
+    "os"
+    "os/exec"
+    "path/filepath"
     "strings"
     "time"
 
@@ -164,9 +169,12 @@ func (h *ValidatorHarness) TestNamespace() string {
     defer h.mutex.Unlock()
 
     if h.testNamespace == "" {
-        prefix := strings.ToLower(h.t.Name())
+        prefix := h.t.Name()
+        prefix = prefix[strings.LastIndex(prefix, "/")+1:]
+        prefix = strings.ToLower(prefix)
         prefix = strings.ReplaceAll(prefix, "/", "-")
         prefix = strings.ReplaceAll(prefix, "_", "-")
+
         ns := fmt.Sprintf("%s-%d", prefix, time.Now().Unix())
 
         nsObj := &unstructured.Unstructured{}
@@ -182,8 +190,11 @@ func (h *ValidatorHarness) TestNamespace() string {
         h.testNamespace = ns
 
         h.t.Cleanup(func() {
+            h.dumpNamespaceResources(ns)
+
             h.Logf("Deleting test namespace %q", ns)
-            err := h.DynamicClient().Resource(namespaceGVR).Delete(h.Context(), ns, metav1.DeleteOptions{})
+            ctx := context.WithoutCancel(h.Context())
+            err := h.DynamicClient().Resource(namespaceGVR).Delete(ctx, ns, metav1.DeleteOptions{})
             if err != nil {
                 h.Logf("failed to delete test namespace: %v", err)
             }
@@ -192,3 +203,65 @@ func (h *ValidatorHarness) TestNamespace() string {
 
     return h.testNamespace
 }
+
+// ApplyManifest applies a Kubernetes manifest from the given file path to the specified namespace.
+// We use kubectl so that the output is clear and in theory someone could run the same commands themselves to debug.
+func (h *ValidatorHarness) ApplyManifest(namespace string, manifestPath string) {
+    h.Logf("Applying manifest %q to namespace %q", manifestPath, namespace)
+    h.ShellExec(fmt.Sprintf("kubectl apply -n %s -f %s", namespace, manifestPath))
+}
+
+// dumpNamespaceResources dumps key resources from the namespace to the artifacts directory for debugging.
+func (h *ValidatorHarness) dumpNamespaceResources(ns string) {
+    artifactsDir := os.Getenv("ARTIFACTS")
+    if artifactsDir == "" {
+        artifactsDir = "_artifacts"
+    }
+
+    testName := strings.ReplaceAll(h.t.Name(), "/", "_")
+    clusterInfoDir := filepath.Join(artifactsDir, "per-test", testName, "cluster-info", ns)
+    if err := os.MkdirAll(clusterInfoDir, 0o755); err != nil {
+        h.Logf("failed to create cluster-info directory: %v", err)
+        return
+    }
+
+    resourceTypes := []string{
+        "pods",
+        "jobs",
+        "deployments",
+        "statefulsets",
+        "services",
+        "events",
+    }
+
+    for _, resourceType := range resourceTypes {
+        if err := h.dumpResource(ns, resourceType, filepath.Join(clusterInfoDir, resourceType+".yaml")); err != nil {
+            h.Logf("failed to dump resource %s: %v", resourceType, err)
+        }
+    }
+}
+
+// dumpResource runs kubectl get for a resource type and writes the output to a file.
+// Errors are logged but do not fail the test.
+func (h *ValidatorHarness) dumpResource(ns string, resourceType string, outputPath string) error {
+    args := []string{"get", resourceType}
+    if ns != "" {
+        args = append(args, "-n", ns)
+    }
+    args = append(args, "-o", "yaml")
+    cmd := exec.CommandContext(context.WithoutCancel(h.Context()), "kubectl", args...)
+    var stdout bytes.Buffer
+    var stderr bytes.Buffer
+    cmd.Stdout = &stdout
+    cmd.Stderr = &stderr
+
+    if err := cmd.Run(); err != nil {
+        return fmt.Errorf("failed to dump %s in namespace %s: %v (stderr: %s)", resourceType, ns, err, stderr.String())
+    }
+
+    if err := os.WriteFile(outputPath, stdout.Bytes(), 0o644); err != nil {
+        return fmt.Errorf("failed to write %s dump to %s: %w", resourceType, outputPath, err)
+    }
+
+    return nil
+}
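Taken together, the harness now owns namespace lifecycle, manifest application, and cleanup-time artifact dumping. A minimal sketch of how a validator test composes these helpers, assuming only the harness calls added or used in this PR (the manifest path and job name are hypothetical):

package validators_test

import (
    "fmt"
    "testing"

    "k8s.io/kops/tests/e2e/scenarios/ai-conformance/validators"
)

func TestExampleValidator(t *testing.T) {
    h := validators.NewValidatorHarness(t)

    // TestNamespace lazily creates a per-test namespace; its cleanup hook now
    // dumps pods, jobs, events, etc. to $ARTIFACTS before deleting the namespace,
    // using context.WithoutCancel so deletion still runs after the test context ends.
    ns := h.TestNamespace()

    // ApplyManifest shells out to kubectl, so the same commands can be re-run by hand.
    h.ApplyManifest(ns, "testdata/example.yaml") // hypothetical manifest
    h.ShellExec(fmt.Sprintf("kubectl wait --for=condition=complete -n %s job/example --timeout=5m", ns))
}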

tests/e2e/scenarios/ai-conformance/validators/output.go

Lines changed: 24 additions & 0 deletions
@@ -19,6 +19,7 @@ package validators
 import (
     "fmt"
     "io"
+    "testing"
 )
 
 // OutputSink is an interface for writing output from the validators.
@@ -61,6 +62,29 @@ func (h *ValidatorHarness) Fatalf(format string, args ...interface{}) {
     h.t.Fatalf(format, args...)
 }
 
+// Errorf is like t.Errorf, but also writes to the sinks.
+func (h *ValidatorHarness) Errorf(format string, args ...interface{}) {
+    s := fmt.Sprintf(format, args...)
+
+    h.output.WriteText("ERROR: " + s)
+    h.t.Errorf(format, args...)
+}
+
+// Run is like t.Run, but creates a sub-harness that shares the output.
+func (h *ValidatorHarness) Run(name string, testFunc func(h *ValidatorHarness)) {
+    h.t.Run(name, func(t *testing.T) {
+        subHarness := &ValidatorHarness{
+            t: t,
+
+            // Share most of the state with the parent harness, but use the sub-test's *testing.T and a new context.
+            output:        h.output,
+            dynamicClient: h.dynamicClient,
+            restConfig:    h.restConfig,
+        }
+        testFunc(subHarness)
+    })
+}
+
 // Success is like Logf, but indicates a successful check.
 func (h *ValidatorHarness) Success(format string, args ...interface{}) {
     s := fmt.Sprintf(format, args...)
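Because Run builds a fresh sub-harness instead of mutating the parent, per-test state such as the lazily-created namespace starts out empty: each sub-test that calls TestNamespace gets its own namespace, while log output still flows to the shared sinks. Extending the sketch after the kube.go diff above (sub-test name and manifest path are hypothetical):

h.Run("check-a", func(h *validators.ValidatorHarness) {
    // The sub-harness has its own *testing.T and empty per-test state,
    // so this namespace (and its cleanup/dump) is unique to "check-a".
    ns := h.TestNamespace()
    h.ApplyManifest(ns, "testdata/check-a.yaml")
})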
Lines changed: 113 additions & 0 deletions
@@ -0,0 +1,113 @@
+/*
+Copyright The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package secure_accelerator_access
+
+import (
+    "fmt"
+    "strings"
+    "testing"
+
+    "k8s.io/kops/tests/e2e/scenarios/ai-conformance/validators"
+)
+
+// TestSecurity_SecureAcceleratorAccess corresponds to the security/secure_accelerator_access conformance requirement
+func TestSecurity_SecureAcceleratorAccess(t *testing.T) {
+    // Description:
+    // Ensure that access to accelerators from within containers is properly isolated and mediated by the Kubernetes resource management framework (device plugin or DRA) and container runtime, preventing unauthorized access or interference between workloads.
+    h := validators.NewValidatorHarness(t)
+
+    h.Logf("# Secure Accelerator Access")
+
+    score := 0
+
+    h.Logf("## Checking that GPUs are available if requested")
+
+    h.Run("accelerator-requested", func(h *validators.ValidatorHarness) {
+        ns := h.TestNamespace()
+
+        h.ApplyManifest(ns, "testdata/accelerator-requested.yaml")
+        h.ShellExec(fmt.Sprintf("kubectl wait -n %s --for=condition=complete job/accelerator-requested --timeout=60s", ns))
+
+        logs := h.ShellExec(fmt.Sprintf("kubectl logs -n %s job/accelerator-requested", ns))
+        if !strings.Contains(logs.Stdout(), "<product_brand>NVIDIA</product_brand>") {
+            h.Errorf("Expected to find nvidia GPUs available when requested, but did not find them in the logs: %s", logs.Stdout())
+        } else {
+            h.Success("GPUs were requested, and nvidia-smi reported available GPUs.")
+            score++
+        }
+    })
+
+    h.Logf("## Checking that GPUs are not available if not requested")
+    h.Run("accelerator-not-requested", func(h *validators.ValidatorHarness) {
+        ns := h.TestNamespace()
+
+        h.ApplyManifest(ns, "testdata/accelerator-not-requested.yaml")
+        h.ShellExec(fmt.Sprintf("kubectl wait -n %s --for=condition=complete job/accelerator-not-requested --timeout=60s", ns))
+
+        logs := h.ShellExec(fmt.Sprintf("kubectl logs -n %s job/accelerator-not-requested", ns))
+        if !strings.Contains(logs.Stdout(), "nvidia-smi failed (as expected)") {
+            h.Errorf("Expected nvidia-smi to fail when GPUs are not requested, but found them in the logs: %s", logs.Stdout())
+        } else {
+            h.Success("No GPUs were requested, and nvidia-smi did not report any GPUs.")
+            score++
+        }
+    })
+
+    h.Logf("## Pods with GPU requests should be isolated from each other")
+    h.Run("accelerator-isolation", func(h *validators.ValidatorHarness) {
+        ns := h.TestNamespace()
+
+        h.ApplyManifest(ns, "testdata/accelerator-isolation.yaml")
+        h.ShellExec(fmt.Sprintf("kubectl wait -n %s --for=condition=available deployment/accelerator-isolation-1 --timeout=60s", ns))
+        h.ShellExec(fmt.Sprintf("kubectl wait -n %s --for=condition=available deployment/accelerator-isolation-2 --timeout=60s", ns))
+
+        logs1 := h.ShellExec(fmt.Sprintf("kubectl logs -n %s deployment/accelerator-isolation-1", ns))
+        logs2 := h.ShellExec(fmt.Sprintf("kubectl logs -n %s deployment/accelerator-isolation-2", ns))
+
+        uuid1 := extractGPUUUID(logs1.Stdout())
+        uuid2 := extractGPUUUID(logs2.Stdout())
+
+        if uuid1 == "" {
+            h.Errorf("Failed to extract GPU UUID from logs of accelerator-isolation-1:\n%s", logs1.Stdout())
+        } else if uuid2 == "" {
+            h.Errorf("Failed to extract GPU UUID from logs of accelerator-isolation-2:\n%s", logs2.Stdout())
+        } else if uuid1 == uuid2 {
+            h.Errorf("Expected that pods with GPU requests would be isolated from each other, but both pods saw the same GPU UUID: %s", uuid1)
+        } else {
+            h.Success("Pods with GPU requests were isolated from each other as expected.")
+            score++
+        }
+    })
+
+    if score == 3 {
+        h.RecordConformance("security/secure_accelerator_access")
+    }
+}
+
+// extractGPUUUID is a helper function to extract the GPU UUID from nvidia-smi XML output in the logs.
+func extractGPUUUID(logs string) string {
+    lines := strings.Split(logs, "\n")
+    for _, line := range lines {
+        line = strings.TrimSpace(line)
+        if strings.HasPrefix(line, "<uuid>") && strings.HasSuffix(line, "</uuid>") {
+            value := strings.TrimPrefix(line, "<uuid>")
+            value = strings.TrimSuffix(value, "</uuid>")
+            return value
+        }
+    }
+    return ""
+}
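extractGPUUUID scans the logs line by line rather than parsing XML, so it relies on nvidia-smi printing <uuid>...</uuid> on a line of its own in -q --xml-format output (and it returns only the first UUID it finds). A minimal unit test sketching the expected shape, which could sit alongside the helper in the same package (the XML fragment is illustrative and the UUID value is made up):

func TestExtractGPUUUID(t *testing.T) {
    logs := `
<gpu id="00000000:38:00.0">
    <product_name>NVIDIA L4</product_name>
    <uuid>GPU-0aaaaaaa-1111-2222-3333-444444444444</uuid>
</gpu>`
    if got, want := extractGPUUUID(logs), "GPU-0aaaaaaa-1111-2222-3333-444444444444"; got != want {
        t.Errorf("extractGPUUUID() = %q, want %q", got, want)
    }
}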
Lines changed: 81 additions & 0 deletions
@@ -0,0 +1,81 @@
+apiVersion: resource.k8s.io/v1
+kind: ResourceClaimTemplate
+metadata:
+  name: single-gpu
+spec:
+  spec:
+    devices:
+      requests:
+      - name: single-gpu
+        exactly:
+          deviceClassName: gpu.nvidia.com
+          allocationMode: ExactCount
+          count: 1
+
+---
+
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: accelerator-isolation-1
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: accelerator-isolation-1
+  template:
+    metadata:
+      labels:
+        app: accelerator-isolation-1
+    spec:
+      containers:
+      - name: default
+        image: ubuntu:22.04
+        command: ["bash", "-c"]
+        args: ["while [ 1 ]; do date; nvidia-smi -q --xml-format; sleep 60; done"]
+        resources:
+          claims:
+          - name: single-gpu
+      # We do request a GPU, so we should be able to access it.
+      resourceClaims:
+      - name: single-gpu
+        resourceClaimTemplateName: single-gpu
+      # We need to tolerate the GPU taint to be scheduled onto a GPU node, even if we are requesting the GPU via a ResourceClaim.
+      tolerations:
+      - key: "nvidia.com/gpu"
+        operator: "Exists"
+        effect: "NoSchedule"
+
+---
+
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: accelerator-isolation-2
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: accelerator-isolation-2
+  template:
+    metadata:
+      labels:
+        app: accelerator-isolation-2
+    spec:
+      containers:
+      - name: default
+        image: ubuntu:22.04
+        command: ["bash", "-c"]
+        args: ["while [ 1 ]; do date; nvidia-smi -q --xml-format; sleep 60; done"]
+        resources:
+          claims:
+          - name: single-gpu
+      # We do request a GPU, so we should be able to access it.
+      resourceClaims:
+      - name: single-gpu
+        resourceClaimTemplateName: single-gpu
+      # We need to tolerate the GPU taint to be scheduled onto a GPU node, even if we are requesting the GPU via a ResourceClaim.
+      tolerations:
+      - key: "nvidia.com/gpu"
+        operator: "Exists"
+        effect: "NoSchedule"
Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: accelerator-not-requested
+spec:
+  template:
+    metadata:
+      labels:
+        app: dra-gpu-example
+    spec:
+      restartPolicy: Never
+      # We want to be scheduled onto a node with GPUs
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+            - matchExpressions:
+              - key: nvidia.com/gpu.present
+                operator: In
+                values:
+                - "true"
+      tolerations:
+      - key: "nvidia.com/gpu"
+        operator: "Exists"
+        effect: "NoSchedule"
+      containers:
+      - name: default
+        image: ubuntu:22.04
+        command: ["bash", "-c"]
+        args: ['nvidia-smi -q --xml-format || echo "nvidia-smi failed (as expected)"']
+      # We do _not_ request a GPU, so we should not be able to access it.
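Note the "|| echo" fallback in the final args line: it makes the container exit 0 even though nvidia-smi fails on a GPU-less allocation, so the Job still reaches condition=complete and the test asserts on the "nvidia-smi failed (as expected)" marker in the logs rather than on a Job failure.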
