Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions controllers/actions.github.com/ephemeralrunner_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -351,6 +351,12 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ
)
return ctrl.Result{}, r.deleteEphemeralRunnerOrPod(ctx, ephemeralRunner, pod, log)

case initContainerFailed(pod):
log.Info("Pod has a failed init container, deleting pod as failed so it can be restarted",
"initContainerStatuses", pod.Status.InitContainerStatuses,
)
return ctrl.Result{}, r.deleteEphemeralRunnerOrPod(ctx, ephemeralRunner, pod, log)
Comment on lines +354 to +358
Copy link

Copilot AI Apr 17, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the init-container-failure branch, deletePodAsFailed will copy pod.Status.Reason/pod.Status.Message onto the EphemeralRunner status. For Pending pods with init container termination details, those fields are often empty (the detail lives on the init container status), which reduces observability and can overwrite any existing status message with "". Consider deriving a reason/message from the failing init container status (reason/message/exitCode) and surfacing that on the EphemeralRunner before deleting/recreating the pod.

Copilot uses AI. Check for mistakes.

case cs == nil:
// starting, no container state yet
log.Info("Waiting for runner container status to be available")
Expand Down Expand Up @@ -862,3 +868,13 @@ func runnerContainerStatus(pod *corev1.Pod) *corev1.ContainerStatus {
}
return nil
}

// initContainerFailed reports whether any entry in
// pod.Status.InitContainerStatuses is currently in the Terminated state with
// a non-zero exit code. It returns false when there are no init container
// statuses. Only the current State of each status is inspected;
// LastTerminationState is not consulted.
func initContainerFailed(pod *corev1.Pod) bool {
for i := range pod.Status.InitContainerStatuses {
// Take the address of the slice element instead of copying the status.
cs := &pod.Status.InitContainerStatuses[i]
// A terminated init container with a non-zero exit code counts as failed.
if cs.State.Terminated != nil && cs.State.Terminated.ExitCode != 0 {
return true
}
Copy link

Copilot AI Apr 17, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

initContainerFailed only checks cs.State.Terminated. In common kubelet failure modes (e.g., init container CrashLoopBackOff / restart backoff), the current state is Waiting and the failure is recorded under cs.LastTerminationState.Terminated. In that case this helper will return false and the reconciler can still fall into the cs == nil branch, leaving the EphemeralRunner stuck. Consider also inspecting LastTerminationState.Terminated (ideally gated on cs.State.Waiting != nil) so restarts are triggered for restart/backoff scenarios too.

Suggested change
}
}
if cs.State.Waiting != nil &&
cs.LastTerminationState.Terminated != nil &&
cs.LastTerminationState.Terminated.ExitCode != 0 {
return true
}

Copilot uses AI. Check for mistakes.
}
return false
}
109 changes: 109 additions & 0 deletions controllers/actions.github.com/ephemeralrunner_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,115 @@ var _ = Describe("EphemeralRunner", func() {
).Should(BeTrue(), "Pod should be re-created")
})

// Scenario: an init container of the runner pod is reported as terminated
// with a non-zero exit code while the pod phase is still Pending. The test
// expects one entry in the EphemeralRunner's Status.Failures and a pod with
// the same name but a different UID.
It("It should re-create pod when init container fails before pod phase transitions to Failed", func() {
pod := new(corev1.Pod)
// Wait until a pod named after the ephemeral runner can be fetched.
Eventually(func() (bool, error) {
if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod); err != nil {
return false, err
}
return true, nil
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeEquivalentTo(true))

// Remember the UID so the final check can tell a new pod from this one.
oldPodUID := pod.UID

// Simulate init container failure without PodFailed phase.
// This can happen when the kubelet has not yet transitioned the pod phase.
pod.Status.Phase = corev1.PodPending
pod.Status.InitContainerStatuses = []corev1.ContainerStatus{
{
Name: "setup",
State: corev1.ContainerState{
Terminated: &corev1.ContainerStateTerminated{
ExitCode: 1,
Reason: "StartError",
Message: "failed to create containerd task: context canceled",
},
},
},
}
Comment on lines +369 to +383
Copy link

Copilot AI Apr 17, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The new tests only cover the init container being in State.Terminated with a non-zero exit code. If the real-world stuck scenario manifests as State.Waiting with the failure captured in LastTerminationState.Terminated (restart/backoff), this change won’t be exercised. Consider adding a test case that sets InitContainerStatuses to Waiting + LastTerminationState.Terminated (non-zero) to ensure the reconciler takes the init-failure branch in that mode too.

Copilot uses AI. Check for mistakes.
// Write the fabricated pod status through the status client.
err := k8sClient.Status().Update(ctx, pod)
Expect(err).To(BeNil(), "Failed to update pod status")

// Poll until the EphemeralRunner reports exactly one entry in Status.Failures.
Eventually(
func() (int, error) {
updated := new(v1alpha1.EphemeralRunner)
err := k8sClient.Get(
ctx,
client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace},
updated,
)
if err != nil {
return 0, err
}
return len(updated.Status.Failures), nil
},
ephemeralRunnerTimeout,
ephemeralRunnerInterval,
).Should(BeEquivalentTo(1))

// Poll until the pod fetched under the same name has a UID different from the original.
Eventually(
func() (bool, error) {
newPod := new(corev1.Pod)
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, newPod)
if err != nil {
return false, err
}
return newPod.UID != oldPodUID, nil
},
ephemeralRunnerTimeout,
ephemeralRunnerInterval,
).Should(BeTrue(), "Pod should be re-created after init container failure")
})

It("It should delete ephemeral runner when init container fails and job is assigned", func() {
er := new(v1alpha1.EphemeralRunner)
Eventually(func() error {
return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, er)
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get ephemeral runner")

er.Status.JobID = "1"
err := k8sClient.Status().Update(ctx, er)
Expect(err).To(BeNil(), "failed to update ephemeral runner status")

Eventually(func() (string, error) {
current := new(v1alpha1.EphemeralRunner)
if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, current); err != nil {
return "", err
}
return current.Status.JobID, nil
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeEquivalentTo("1"))

pod := new(corev1.Pod)
Eventually(func() (bool, error) {
if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod); err != nil {
return false, err
}
return true, nil
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeEquivalentTo(true))

// Simulate init container failure with job assigned
pod.Status.Phase = corev1.PodPending
pod.Status.InitContainerStatuses = []corev1.ContainerStatus{
{
Name: "setup",
State: corev1.ContainerState{
Terminated: &corev1.ContainerStateTerminated{
ExitCode: 1,
Reason: "StartError",
},
},
},
}
err = k8sClient.Status().Update(ctx, pod)
Expect(err).To(BeNil(), "Failed to update pod status")

Eventually(func() bool {
check := new(v1alpha1.EphemeralRunner)
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, check)
return kerrors.IsNotFound(err)
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeTrue(), "Ephemeral runner should eventually be deleted when init container fails with job assigned")
})

It("It should treat pod failed with runner container exit 0 as success with job id", func() {
er := new(v1alpha1.EphemeralRunner)
Eventually(func() error {
Expand Down
Loading