Merge pull request #3890 from justinsb/clusterloader_pod_startup_latency

k8s-ci-robot · web-flow · commit 0d72d5640dbc · 2026-04-01T21:47:09.000+05:30
clusterloader2: create pod-startup-latency test
diff --git a/clusterloader2/testing/pod-startup-latency/config.yaml b/clusterloader2/testing/pod-startup-latency/config.yaml
@@ -0,0 +1,93 @@
+# ASSUMPTIONS:
+# - This test is designed for a 1-node cluster.
+# - Pods take less than 1 second to start, so we can launch them at a rate of 1 pod per second without creating a backlog.
+
+# Constants
+{{$POD_COUNT := DefaultParam .POD_COUNT 30}} # Number of namespaces; each gets one single-replica ReplicationController, so this is also the number of pods.
+{{$POD_THROUGHPUT := DefaultParam .POD_THROUGHPUT 1}} # We expect the pod to launch in less than one second, so launching more than 1 pod per second would create a backlog and test a different behaviour.
+{{$CONTAINER_IMAGE := DefaultParam .CONTAINER_IMAGE "registry.k8s.io/pause:3.9"}} # Image handed to rc.yaml; it must already be on the node because rc.yaml sets imagePullPolicy: Never.
+{{$POD_STARTUP_LATENCY_THRESHOLD := DefaultParam .POD_STARTUP_LATENCY_THRESHOLD "5s"}} # Passed as the `threshold` param of the PodStartupLatency measurement.
+{{$OPERATION_TIMEOUT := DefaultParam .OPERATION_TIMEOUT "15m"}} # Passed as the `operationTimeout` param of the WaitForControlledPodsRunning measurement.
+
+name: pod-startup-latency
+namespace:
+  number: {{$POD_COUNT}}  # One namespace per pod: the phases below place a single one-replica RC in each namespace.
+tuningSets:
+- name: UniformQPS  # Referenced by the "Creating pods" and "Deleting pods" phases via `tuningSet`.
+  qpsLoad:
+    qps: {{$POD_THROUGHPUT}}  # Set from POD_THROUGHPUT (default 1).
+steps:
+- name: Starting measurements
+  measurements:
+  - Identifier: APIResponsivenessPrometheusSimple  # Gathered under the same Identifier in "Collecting measurements".
+    Method: APIResponsivenessPrometheus
+    Params:
+      action: start
+  - Identifier: PodStartupLatency  # Gathered under the same Identifier in "Collecting pods measurements".
+    Method: PodStartupLatency
+    Params:
+      action: start
+      labelSelector: group = latency  # rc.yaml sets a `group` label from templateFillMap.Group (latency) on the RC and its pod template.
+      threshold: {{$POD_STARTUP_LATENCY_THRESHOLD}}
+- name: Starting pods measurements
+  measurements:
+  - Identifier: WaitForRunningLatencyRCs  # The same Identifier is used by the two later `gather` steps.
+    Method: WaitForControlledPodsRunning
+    Params:
+      action: start
+      apiVersion: v1
+      kind: ReplicationController
+      labelSelector: group = latency  # Same `group` value that the create phase passes to rc.yaml.
+      operationTimeout: {{$OPERATION_TIMEOUT}}
+- name: Creating pods
+  phases:
+  - namespaceRange:
+      min: 1
+      max: {{$POD_COUNT}}  # Covers every namespace declared by `namespace.number`.
+    replicasPerNamespace: 1  # One RC object per namespace.
+    tuningSet: UniformQPS
+    objectBundle:
+    - basename: latency-pod-rc
+      objectTemplatePath: rc.yaml
+      templateFillMap:
+        Replicas: 1  # Fills `.Replicas` in rc.yaml: one pod per RC.
+        Group: latency  # Fills `.Group` in rc.yaml; must agree with the `group = latency` selectors used by the measurements.
+        Image: {{$CONTAINER_IMAGE}}
+- name: Waiting for pods to be running
+  measurements:
+  - Identifier: WaitForRunningLatencyRCs
+    Method: WaitForControlledPodsRunning
+    Params:
+      action: gather
+- name: Deleting pods
+  phases:
+  - namespaceRange:
+      min: 1
+      max: {{$POD_COUNT}}
+    replicasPerNamespace: 0  # NOTE(review): presumably zero replicas is how the RCs created earlier get removed, as the step name suggests - confirm against clusterloader2 docs.
+    tuningSet: UniformQPS
+    objectBundle:
+    - basename: latency-pod-rc  # Same basename and template as in the "Creating pods" phase.
+      objectTemplatePath: rc.yaml
+- name: Waiting for pods to be deleted
+  measurements:
+  - Identifier: WaitForRunningLatencyRCs
+    Method: WaitForControlledPodsRunning
+    Params:
+      action: gather
+# Collect measurements
+- name: Collecting pods measurements
+  measurements:
+  - Identifier: PodStartupLatency  # Gathers the measurement started in "Starting measurements".
+    Method: PodStartupLatency
+    Params:
+      action: gather
+- name: Collecting measurements
+  measurements:
+  - Identifier: APIResponsivenessPrometheusSimple  # Gathers the measurement started in "Starting measurements".
+    Method: APIResponsivenessPrometheus
+    Params:
+      action: gather
+      enableViolations: true
+      useSimpleLatencyQuery: true
+      summaryName: APIResponsivenessPrometheus_simple
diff --git a/clusterloader2/testing/pod-startup-latency/rc.yaml b/clusterloader2/testing/pod-startup-latency/rc.yaml
@@ -0,0 +1,34 @@
+apiVersion: v1
+kind: ReplicationController
+metadata:
+  name: {{.Name}}
+  labels:
+    group: {{.Group}}  # config.yaml's measurements select on `group = latency`.
+spec:
+  replicas: {{.Replicas}}
+  selector:
+    name: {{.Name}}
+  template:
+    metadata:
+      labels:
+        name: {{.Name}}  # Must match spec.selector.name so the controller selects these pods.
+        group: {{.Group}}  # config.yaml's measurements select on `group = latency`.
+    spec:
+      # Automount the default service account token, to be more representative of real workloads.
+      automountServiceAccountToken: true
+      containers:
+      - image: {{.Image}}
+        imagePullPolicy: Never # Image is expected to be already present on the node, so we can measure pod startup latency without image pull time.
+        name: {{.Name}}
+        ports: []  # No container ports are declared; an explicit empty list replaces a bare key, which parses as null.
+      # Add not-ready/unreachable tolerations for 15 minutes so that node
+      # failure doesn't trigger pod deletion.
+      tolerations:
+      - key: "node.kubernetes.io/not-ready"
+        operator: "Exists"
+        effect: "NoExecute"
+        tolerationSeconds: 900
+      - key: "node.kubernetes.io/unreachable"
+        operator: "Exists"
+        effect: "NoExecute"
+        tolerationSeconds: 900