-
Notifications
You must be signed in to change notification settings - Fork 1.4k
Fix: Detect init container failure in EphemeralRunner controller #4457
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -351,6 +351,12 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ | |||||||||||||||||
| ) | ||||||||||||||||||
| return ctrl.Result{}, r.deleteEphemeralRunnerOrPod(ctx, ephemeralRunner, pod, log) | ||||||||||||||||||
|
|
||||||||||||||||||
| case initContainerFailed(pod): | ||||||||||||||||||
| log.Info("Pod has a failed init container, deleting pod as failed so it can be restarted", | ||||||||||||||||||
| "initContainerStatuses", pod.Status.InitContainerStatuses, | ||||||||||||||||||
| ) | ||||||||||||||||||
| return ctrl.Result{}, r.deleteEphemeralRunnerOrPod(ctx, ephemeralRunner, pod, log) | ||||||||||||||||||
|
|
||||||||||||||||||
| case cs == nil: | ||||||||||||||||||
| // starting, no container state yet | ||||||||||||||||||
| log.Info("Waiting for runner container status to be available") | ||||||||||||||||||
|
|
@@ -862,3 +868,13 @@ func runnerContainerStatus(pod *corev1.Pod) *corev1.ContainerStatus { | |||||||||||||||||
| } | ||||||||||||||||||
| return nil | ||||||||||||||||||
| } | ||||||||||||||||||
|
|
||||||||||||||||||
| func initContainerFailed(pod *corev1.Pod) bool { | ||||||||||||||||||
| for i := range pod.Status.InitContainerStatuses { | ||||||||||||||||||
| cs := &pod.Status.InitContainerStatuses[i] | ||||||||||||||||||
| if cs.State.Terminated != nil && cs.State.Terminated.ExitCode != 0 { | ||||||||||||||||||
| return true | ||||||||||||||||||
| } | ||||||||||||||||||
|
||||||||||||||||||
| } | |
| } | |
| if cs.State.Waiting != nil && | |
| cs.LastTerminationState.Terminated != nil && | |
| cs.LastTerminationState.Terminated.ExitCode != 0 { | |
| return true | |
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -355,6 +355,115 @@ var _ = Describe("EphemeralRunner", func() { | |
| ).Should(BeTrue(), "Pod should be re-created") | ||
| }) | ||
|
|
||
| It("It should re-create pod when init container fails before pod phase transitions to Failed", func() { | ||
| pod := new(corev1.Pod) | ||
| Eventually(func() (bool, error) { | ||
| if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod); err != nil { | ||
| return false, err | ||
| } | ||
| return true, nil | ||
| }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeEquivalentTo(true)) | ||
|
|
||
| oldPodUID := pod.UID | ||
|
|
||
| // Simulate init container failure without PodFailed phase. | ||
| // This can happen when the kubelet has not yet transitioned the pod phase. | ||
| pod.Status.Phase = corev1.PodPending | ||
| pod.Status.InitContainerStatuses = []corev1.ContainerStatus{ | ||
| { | ||
| Name: "setup", | ||
| State: corev1.ContainerState{ | ||
| Terminated: &corev1.ContainerStateTerminated{ | ||
| ExitCode: 1, | ||
| Reason: "StartError", | ||
| Message: "failed to create containerd task: context canceled", | ||
| }, | ||
| }, | ||
| }, | ||
| } | ||
|
Comment on lines +369 to +383
|
||
| err := k8sClient.Status().Update(ctx, pod) | ||
| Expect(err).To(BeNil(), "Failed to update pod status") | ||
|
|
||
| Eventually( | ||
| func() (int, error) { | ||
| updated := new(v1alpha1.EphemeralRunner) | ||
| err := k8sClient.Get( | ||
| ctx, | ||
| client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, | ||
| updated, | ||
| ) | ||
| if err != nil { | ||
| return 0, err | ||
| } | ||
| return len(updated.Status.Failures), nil | ||
| }, | ||
| ephemeralRunnerTimeout, | ||
| ephemeralRunnerInterval, | ||
| ).Should(BeEquivalentTo(1)) | ||
|
|
||
| Eventually( | ||
| func() (bool, error) { | ||
| newPod := new(corev1.Pod) | ||
| err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, newPod) | ||
| if err != nil { | ||
| return false, err | ||
| } | ||
| return newPod.UID != oldPodUID, nil | ||
| }, | ||
| ephemeralRunnerTimeout, | ||
| ephemeralRunnerInterval, | ||
| ).Should(BeTrue(), "Pod should be re-created after init container failure") | ||
| }) | ||
|
|
||
| It("It should delete ephemeral runner when init container fails and job is assigned", func() { | ||
| er := new(v1alpha1.EphemeralRunner) | ||
| Eventually(func() error { | ||
| return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, er) | ||
| }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get ephemeral runner") | ||
|
|
||
| er.Status.JobID = "1" | ||
| err := k8sClient.Status().Update(ctx, er) | ||
| Expect(err).To(BeNil(), "failed to update ephemeral runner status") | ||
|
|
||
| Eventually(func() (string, error) { | ||
| current := new(v1alpha1.EphemeralRunner) | ||
| if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, current); err != nil { | ||
| return "", err | ||
| } | ||
| return current.Status.JobID, nil | ||
| }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeEquivalentTo("1")) | ||
|
|
||
| pod := new(corev1.Pod) | ||
| Eventually(func() (bool, error) { | ||
| if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod); err != nil { | ||
| return false, err | ||
| } | ||
| return true, nil | ||
| }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeEquivalentTo(true)) | ||
|
|
||
| // Simulate init container failure with job assigned | ||
| pod.Status.Phase = corev1.PodPending | ||
| pod.Status.InitContainerStatuses = []corev1.ContainerStatus{ | ||
| { | ||
| Name: "setup", | ||
| State: corev1.ContainerState{ | ||
| Terminated: &corev1.ContainerStateTerminated{ | ||
| ExitCode: 1, | ||
| Reason: "StartError", | ||
| }, | ||
| }, | ||
| }, | ||
| } | ||
| err = k8sClient.Status().Update(ctx, pod) | ||
| Expect(err).To(BeNil(), "Failed to update pod status") | ||
|
|
||
| Eventually(func() bool { | ||
| check := new(v1alpha1.EphemeralRunner) | ||
| err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, check) | ||
| return kerrors.IsNotFound(err) | ||
| }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeTrue(), "Ephemeral runner should eventually be deleted when init container fails with job assigned") | ||
| }) | ||
|
|
||
| It("It should treat pod failed with runner container exit 0 as success with job id", func() { | ||
| er := new(v1alpha1.EphemeralRunner) | ||
| Eventually(func() error { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In the init-container-failure branch,
`deletePodAsFailed` will copy `pod.Status.Reason` / `pod.Status.Message` onto the EphemeralRunner status. For Pending pods with init container termination details, those fields are often empty (the detail lives on the init container status), which reduces observability and can overwrite any existing status message with "". Consider deriving a reason/message from the failing init container status (reason/message/exitCode) and surfacing that on the EphemeralRunner before deleting/recreating the pod.