Skip to content

Commit 4607dcc

Browse files
authored
Merge pull request #18056 from justinsb/aiconformance_operator
[aiconformance]: Add robust_controller test using KubeRay
2 parents 3911794 + 37749f9 commit 4607dcc

5 files changed

Lines changed: 296 additions & 21 deletions

File tree

tests/e2e/scenarios/ai-conformance/run-test.sh

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -95,10 +95,15 @@ echo "----------------------------------------------------------------"
9595
echo "Installing Gateway API CRDs v1.2.0..."
9696
kubectl apply -f https://github.com/kubernetes-sigs/gateway-api/releases/download/v1.2.0/standard-install.yaml
9797

98-
# cert-manager: required for KubeRay webhooks
98+
# cert-manager
9999
echo "Installing cert-manager..."
100100
kubectl apply --server-side -f https://github.com/cert-manager/cert-manager/releases/download/v1.19.2/cert-manager.yaml
101101

102+
echo "Waiting for cert-manager to be ready..."
103+
kubectl rollout status deployment -n cert-manager cert-manager --timeout=5m
104+
kubectl rollout status deployment -n cert-manager cert-manager-webhook --timeout=5m
105+
kubectl rollout status deployment -n cert-manager cert-manager-cainjector --timeout=5m
106+
102107
# Setup helm repos for monitoring and NVIDIA components
103108
helm repo add nvidia https://helm.ngc.nvidia.com/nvidia
104109
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
@@ -166,8 +171,17 @@ helm upgrade -i nvidia-dra-driver-gpu nvidia/nvidia-dra-driver-gpu \
166171
--wait
167172

168173
# KubeRay
174+
# Use the Helm chart; the kustomize "default-with-webhooks" overlay has incomplete RBAC
175+
# (missing cluster-scope secrets permission), causing the operator to crash-loop.
176+
# The Helm chart does not support webhooks, so we install without them.
169177
echo "Installing KubeRay Operator..."
170-
kubectl apply --server-side -k "github.com/ray-project/kuberay/ray-operator/config/default-with-webhooks?ref=v1.5.0"
178+
helm repo add kuberay https://ray-project.github.io/kuberay-helm/
179+
helm repo update kuberay
180+
helm upgrade -i kuberay-operator kuberay/kuberay-operator \
181+
--version 1.5.0 \
182+
--namespace ray-system \
183+
--create-namespace \
184+
--wait
171185

172186
# Kueue
173187
echo "Installing Kueue..."
@@ -188,7 +202,7 @@ echo "Verifying Kueue..."
188202
kubectl rollout status deployment -n kueue-system kueue-controller-manager --timeout=5m || echo "Warning: Kueue not ready yet"
189203

190204
echo "Verifying KubeRay..."
191-
kubectl rollout status deployment -n kuberay-system kuberay-operator --timeout=5m || echo "Warning: KubeRay not ready yet"
205+
kubectl rollout status deployment -n ray-system kuberay-operator --timeout=5m || echo "Warning: KubeRay not ready yet"
192206

193207
echo "Verifying Gateway API..."
194208
kubectl get gatewayclass || echo "Warning: GatewayClass not found"

tests/e2e/scenarios/ai-conformance/validators/kube.go

Lines changed: 51 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ package validators
1919
import (
2020
"bytes"
2121
"context"
22+
"errors"
2223
"fmt"
2324
"os"
2425
"os/exec"
@@ -190,10 +191,10 @@ func (h *ValidatorHarness) TestNamespace() string {
190191
h.testNamespace = ns
191192

192193
h.t.Cleanup(func() {
193-
h.dumpNamespaceResources(ns)
194+
ctx := context.WithoutCancel(h.Context())
195+
h.dumpNamespaceResources(ctx, ns)
194196

195197
h.Logf("Deleting test namespace %q", ns)
196-
ctx := context.WithoutCancel(h.Context())
197198
err := h.DynamicClient().Resource(namespaceGVR).Delete(ctx, ns, metav1.DeleteOptions{})
198199
if err != nil {
199200
h.Logf("failed to delete test namespace: %v", err)
@@ -212,7 +213,7 @@ func (h *ValidatorHarness) ApplyManifest(namespace string, manifestPath string)
212213
}
213214

214215
// dumpNamespaceResources dumps key resources from the namespace to the artifacts directory for debugging.
215-
func (h *ValidatorHarness) dumpNamespaceResources(ns string) {
216+
func (h *ValidatorHarness) dumpNamespaceResources(ctx context.Context, ns string) {
216217
artifactsDir := os.Getenv("ARTIFACTS")
217218
if artifactsDir == "" {
218219
artifactsDir = "_artifacts"
@@ -235,21 +236,25 @@ func (h *ValidatorHarness) dumpNamespaceResources(ns string) {
235236
}
236237

237238
for _, resourceType := range resourceTypes {
238-
if err := h.dumpResource(ns, resourceType, filepath.Join(clusterInfoDir, resourceType+".yaml")); err != nil {
239+
if err := h.dumpResource(ctx, ns, resourceType, filepath.Join(clusterInfoDir, resourceType+".yaml")); err != nil {
239240
h.Logf("failed to dump resource %s: %v", resourceType, err)
240241
}
241242
}
243+
244+
if err := h.dumpPodLogs(ctx, ns, clusterInfoDir); err != nil {
245+
h.Logf("failed to dump pod logs: %v", err)
246+
}
242247
}
243248

244249
// dumpResource runs kubectl get for a resource type and writes the output to a file.
245250
// Errors are logged but do not fail the test.
246-
func (h *ValidatorHarness) dumpResource(ns string, resourceType string, outputPath string) error {
251+
func (h *ValidatorHarness) dumpResource(ctx context.Context, ns string, resourceType string, outputPath string) error {
247252
args := []string{"get", resourceType}
248253
if ns != "" {
249254
args = append(args, "-n", ns)
250255
}
251256
args = append(args, "-o", "yaml")
252-
cmd := exec.CommandContext(context.WithoutCancel(h.Context()), "kubectl", args...)
257+
cmd := exec.CommandContext(ctx, "kubectl", args...)
253258
var stdout bytes.Buffer
254259
var stderr bytes.Buffer
255260
cmd.Stdout = &stdout
@@ -265,3 +270,43 @@ func (h *ValidatorHarness) dumpResource(ns string, resourceType string, outputPa
265270

266271
return nil
267272
}
273+
274+
// dumpPodLogs collects logs from all pods in the namespace and writes them to individual files.
275+
func (h *ValidatorHarness) dumpPodLogs(ctx context.Context, ns string, clusterInfoDir string) error {
276+
podLogsDir := filepath.Join(clusterInfoDir, "pod-logs")
277+
278+
// List pods in the namespace
279+
cmd := exec.CommandContext(ctx, "kubectl", "get", "pods", "-n", ns, "-o", "jsonpath={.items[*].metadata.name}")
280+
var stdout bytes.Buffer
281+
cmd.Stdout = &stdout
282+
var stderr bytes.Buffer
283+
cmd.Stderr = &stderr
284+
if err := cmd.Run(); err != nil {
285+
return fmt.Errorf("failed to list pods for log collection in namespace %s (stderr: %s): %w", ns, stderr.String(), err)
286+
}
287+
288+
podNames := strings.Fields(stdout.String())
289+
290+
if err := os.MkdirAll(podLogsDir, 0o755); err != nil {
291+
return fmt.Errorf("failed to create pod-logs directory: %v", err)
292+
}
293+
294+
var errs []error
295+
for _, podName := range podNames {
296+
logCmd := exec.CommandContext(ctx, "kubectl", "logs", "-n", ns, podName, "--all-containers", "--ignore-errors")
297+
var logOut bytes.Buffer
298+
logCmd.Stdout = &logOut
299+
var logErr bytes.Buffer
300+
logCmd.Stderr = &logErr
301+
if err := logCmd.Run(); err != nil {
302+
errs = append(errs, fmt.Errorf("failed to get logs for pod %s (stderr: %s): %w", podName, logErr.String(), err))
303+
continue
304+
}
305+
logPath := filepath.Join(podLogsDir, podName+".log")
306+
if err := os.WriteFile(logPath, logOut.Bytes(), 0o644); err != nil {
307+
errs = append(errs, fmt.Errorf("failed to write logs for pod %s to %s: %w", podName, logPath, err))
308+
}
309+
}
310+
311+
return errors.Join(errs...)
312+
}
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
/*
2+
Copyright The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package robust_controller
18+
19+
import (
20+
"fmt"
21+
"strings"
22+
"testing"
23+
24+
"k8s.io/kops/tests/e2e/scenarios/ai-conformance/validators"
25+
)
26+
27+
// TestOperator_RobustController_ViaKuberay corresponds to the operator/robust_controller conformance requirement,
28+
// tested using the KubeRay operator as an example of a complex AI operator with a CRD.
29+
func TestOperator_RobustController_ViaKuberay(t *testing.T) {
30+
// Description:
31+
// The platform must prove that at least one complex AI operator with a CRD (e.g., Ray, Kubeflow) can be installed and functions reliably. This includes verifying that the operator's pods run correctly, its webhooks are operational, and its custom resources can be reconciled.
32+
33+
h := validators.NewValidatorHarness(t)
34+
35+
if !h.HasCRD("rayjobs.ray.io") {
36+
h.Skip("Ray CRDs not found, skipping test")
37+
}
38+
39+
h.Logf("# Robust Controller (with KubeRay)")
40+
41+
h.Logf("## Verify KubeRay with a sample RayJob")
42+
{
43+
// This is based on https://docs.ray.io/en/latest/cluster/kubernetes/getting-started/rayjob-quick-start.html#kuberay-rayjob-quickstart
44+
45+
ns := h.TestNamespace()
46+
47+
h.ApplyManifest(ns, "testdata/rayjob-sample.yaml")
48+
h.ShellExec(fmt.Sprintf("kubectl wait -n %s --for='jsonpath={.status.jobDeploymentStatus}=Complete' rayjob/rayjob-sample --timeout=300s", ns))
49+
50+
logs := h.ShellExec(fmt.Sprintf("kubectl logs -n %s -l=job-name=rayjob-sample", ns))
51+
52+
succeeded := false
53+
for _, line := range strings.Split(logs.Stdout(), "\n") {
54+
if strings.Contains(line, "SUCC cli.py") && strings.Contains(line, "Job 'rayjob-sample-") && strings.Contains(line, " succeeded") {
55+
h.Success("Found succeeded message in logs, indicating the RayJob completed successfully.")
56+
succeeded = true
57+
break
58+
}
59+
}
60+
if !succeeded {
61+
h.Fatalf("Did not find succeeded message in logs: %s", logs.Stdout())
62+
}
63+
}
64+
65+
h.RecordConformance("operator/robust_controller")
66+
}
Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
# Based on https://raw.githubusercontent.com/ray-project/kuberay/v1.5.1/ray-operator/config/samples/ray-job.sample.yaml

apiVersion: ray.io/v1
kind: RayJob
metadata:
  name: rayjob-sample
spec:
  # submissionMode specifies how RayJob submits the Ray job to the RayCluster.
  # The default value is "K8sJobMode", meaning RayJob will submit the Ray job via a submitter Kubernetes Job.
  # The alternative value is "HTTPMode", indicating that KubeRay will submit the Ray job by sending an HTTP request to the RayCluster.
  # submissionMode: "K8sJobMode"
  entrypoint: python /home/ray/samples/sample_code.py
  # shutdownAfterJobFinishes specifies whether the RayCluster should be deleted after the RayJob finishes. Default is false.
  # shutdownAfterJobFinishes: false

  # ttlSecondsAfterFinished specifies the number of seconds after which the RayCluster will be deleted after the RayJob finishes.
  # ttlSecondsAfterFinished: 10

  # activeDeadlineSeconds is the duration in seconds that the RayJob may be active before
  # KubeRay actively tries to terminate the RayJob; value must be positive integer.
  # activeDeadlineSeconds: 120

  # RuntimeEnvYAML represents the runtime environment configuration provided as a multi-line YAML string.
  # See https://docs.ray.io/en/latest/ray-core/handling-dependencies.html for details.
  # (New in KubeRay version 1.0.)
  runtimeEnvYAML: |
    pip:
      - requests==2.26.0
      - pendulum==2.1.2
    env_vars:
      counter_name: "test_counter"

  # Suspend specifies whether the RayJob controller should create a RayCluster instance.
  # If a job is applied with the suspend field set to true, the RayCluster will not be created and we will wait for the transition to false.
  # If the RayCluster is already created, it will be deleted. In the case of transition to false, a new RayCluster will be created.
  # suspend: false

  # rayClusterSpec specifies the RayCluster instance to be created by the RayJob controller.
  rayClusterSpec:
    rayVersion: '2.46.0' # should match the Ray version in the image of the containers
    # Ray head pod template
    headGroupSpec:
      # The `rayStartParams` are used to configure the `ray start` command.
      # See https://github.com/ray-project/kuberay/blob/master/docs/guidance/rayStartParams.md for the default settings of `rayStartParams` in KubeRay.
      # See https://docs.ray.io/en/latest/cluster/cli.html#ray-start for all available options in `rayStartParams`.
      rayStartParams: {}
      # pod template
      template:
        spec:
          containers:
            - name: ray-head
              image: rayproject/ray:2.46.0
              ports:
                - containerPort: 6379
                  name: gcs-server
                - containerPort: 8265 # Ray dashboard
                  name: dashboard
                - containerPort: 10001
                  name: client
              resources:
                limits:
                  cpu: "1"
                requests:
                  cpu: "200m"
              volumeMounts:
                - mountPath: /home/ray/samples
                  name: code-sample
          volumes:
            # You set volumes at the Pod level, then mount them into containers inside that Pod
            - name: code-sample
              configMap:
                # Provide the name of the ConfigMap you want to mount.
                name: ray-job-code-sample
                # An array of keys from the ConfigMap to create as files
                items:
                  - key: sample_code.py
                    path: sample_code.py
    workerGroupSpecs:
      # the pod replicas in this group typed worker
      - replicas: 1
        minReplicas: 1
        maxReplicas: 5
        # logical group name, for this called small-group, also can be functional
        groupName: small-group
        # The `rayStartParams` are used to configure the `ray start` command.
        # See https://github.com/ray-project/kuberay/blob/master/docs/guidance/rayStartParams.md for the default settings of `rayStartParams` in KubeRay.
        # See https://docs.ray.io/en/latest/cluster/cli.html#ray-start for all available options in `rayStartParams`.
        rayStartParams: {}
        # pod template
        template:
          spec:
            containers:
              - name: ray-worker # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc'
                image: rayproject/ray:2.46.0
                resources:
                  limits:
                    cpu: "1"
                  requests:
                    cpu: "200m"

  # SubmitterPodTemplate is the template for the pod that will run the `ray job submit` command against the RayCluster.
  # If SubmitterPodTemplate is specified, the first container is assumed to be the submitter container.
  # submitterPodTemplate:
  #   spec:
  #     restartPolicy: Never
  #     containers:
  #       - name: my-custom-rayjob-submitter-pod
  #         image: rayproject/ray:2.46.0
  #         # If Command is not specified, the correct command will be supplied at runtime using the RayJob spec `entrypoint` field.
  #         # Specifying Command is not recommended.
  #         # command: ["sh", "-c", "ray job submit --address=http://$RAY_DASHBOARD_ADDRESS --submission-id=$RAY_JOB_SUBMISSION_ID -- echo hello world"]


######################Ray code sample#################################
# this sample is from https://docs.ray.io/en/latest/cluster/job-submission.html#quick-start-example
# it is mounted into the container and executed to show the Ray job at work
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: ray-job-code-sample
data:
  sample_code.py: |
    import ray
    import os
    import requests

    ray.init()

    @ray.remote
    class Counter:
        def __init__(self):
            # Used to verify runtimeEnv
            self.name = os.getenv("counter_name")
            assert self.name == "test_counter"
            self.counter = 0

        def inc(self):
            self.counter += 1

        def get_counter(self):
            return "{} got {}".format(self.name, self.counter)

    counter = Counter.remote()

    for _ in range(5):
        ray.get(counter.inc.remote())
        print(ray.get(counter.get_counter.remote()))

    # Verify that the correct runtime env was used for the job.
    assert requests.__version__ == "2.26.0"

0 commit comments

Comments
 (0)