From ca22df428f633474a097a6116588138fde267e1e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Stephan=20He=C3=9Felmann?=
Date: Tue, 23 Jun 2026 01:31:58 +0200
Subject: [PATCH] ROX-35269: clean up managed clusters before destroying
 infra-pr GKE clusters

When a GKE cluster running a test infra-server is destroyed, any test
clusters it manages are orphaned because their Argo destroy phases never
run. Add a pre-destroy cleanup step that connects to the target cluster,
stops all active Argo workflows (triggering their onExit handlers), and
waits for cloud resource cleanup to complete before tearing down the
cluster.

The cleanup is gated by a new has-infra-server workflow parameter
(default false) so it only runs on clusters that host an infra-server
instance. PR.yaml passes has-infra-server=true for infra-pr clusters.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
Review notes:
- The cleanup script now distinguishes a failed `argo list` from an
  empty result, so an unreachable API server is no longer reported as a
  finished cleanup.
- The index blob id of the new file predates these script changes.

 .github/workflows/PR.yaml                     |  2 +-
 Makefile                                      |  2 +-
 chart/infra-server/static/flavors.yaml        |  5 ++
 .../static/workflow-gke-default.yaml          | 22 +++++
 .../cleanup-infra-clusters.yaml               | 80 +++++++++++++++++++
 5 files changed, 109 insertions(+), 2 deletions(-)
 create mode 100644 chart/infra-server/templates/workflowtemplates/cleanup-infra-clusters.yaml

diff --git a/.github/workflows/PR.yaml b/.github/workflows/PR.yaml
index f47e0a102..b5ca39ea8 100644
--- a/.github/workflows/PR.yaml
+++ b/.github/workflows/PR.yaml
@@ -34,7 +34,7 @@ jobs:
         with:
           flavor: gke-default
           name: infra-pr-${{ github.event.pull_request.number }}
-          args: machine-type=e2-standard-4,nodes=3,gcp-image-type=ubuntu_containerd
+          args: machine-type=e2-standard-4,nodes=3,gcp-image-type=ubuntu_containerd,has-infra-server=true
           lifespan: ${{ github.actor == 'dependabot[bot]' && '1h' || '24h' }}
           wait: true
           token: ${{ secrets.INFRA_TOKEN }}
diff --git a/Makefile b/Makefile
index 3e948eb93..faedb1397 100644
--- a/Makefile
+++ b/Makefile
@@ -175,7 +175,7 @@ push:
 
 .PHONY: argo-workflow-lint
 argo-workflow-lint:
-	@argo lint ./chart/infra-server/static/workflow*.yaml
+	@argo lint --offline ./chart/infra-server/static/workflow*.yaml ./chart/infra-server/templates/workflowtemplates/*.yaml
 
 .PHONY: shellcheck
 shellcheck:
diff --git a/chart/infra-server/static/flavors.yaml b/chart/infra-server/static/flavors.yaml
index a73895f8c..9e209fc67 100644
--- a/chart/infra-server/static/flavors.yaml
+++ b/chart/infra-server/static/flavors.yaml
@@ -194,6 +194,11 @@
       value: false
       kind: optional
 
+    - name: has-infra-server
+      description: Whether this cluster runs an infra-server instance. Enables pre-destroy cleanup of managed clusters.
+      value: false
+      kind: optional
+
   artifacts:
     - name: kubeconfig
       description: Kube config for connecting to this cluster
diff --git a/chart/infra-server/static/workflow-gke-default.yaml b/chart/infra-server/static/workflow-gke-default.yaml
index 4be3a1203..816fd0f0a 100644
--- a/chart/infra-server/static/workflow-gke-default.yaml
+++ b/chart/infra-server/static/workflow-gke-default.yaml
@@ -2,6 +2,7 @@ apiVersion: argoproj.io/v1alpha1
 kind: Workflow
 metadata:
   generateName: gke-default-
+  namespace: default
 spec:
   entrypoint: start
   onExit: stop
@@ -20,6 +21,8 @@ spec:
         value: ""
       - name: set-ssd-storage-default
         value: ""
+      - name: has-infra-server
+        value: "false"
 
   volumes:
     - name: credentials
@@ -36,6 +39,23 @@ spec:
 
     - name: stop
       steps:
+        # Stops all Argo workflows on a target cluster before it is destroyed,
+        # giving each workflow's onExit handler time to clean up cloud resources.
+        # Only relevant for GKE clusters that run the infra server.
+        - - name: cleanup-infra-clusters
+            templateRef:
+              name: cleanup-infra-clusters
+              template: cleanup
+            arguments:
+              artifacts:
+                - name: kubeconfig
+                  from: '{{ "{{" }}workflow.outputs.artifacts.global-kubeconfig{{ "}}" }}'
+                  optional: true
+            when: '{{ "{{" }}workflow.parameters.has-infra-server{{ "}}" }} == true'
+            continueOn:
+              failed: true
+              error: true
+
         - - name: destroy
             template: destroy
             arguments:
@@ -48,8 +68,10 @@ spec:
       outputs:
         artifacts:
           - name: kubeconfig
+            globalName: global-kubeconfig
             path: /outputs/kubeconfig
             mode: 0644
+            optional: true
             archive:
               none: {}
     - name: connect
diff --git a/chart/infra-server/templates/workflowtemplates/cleanup-infra-clusters.yaml b/chart/infra-server/templates/workflowtemplates/cleanup-infra-clusters.yaml
new file mode 100644
index 000000000..97f598428
--- /dev/null
+++ b/chart/infra-server/templates/workflowtemplates/cleanup-infra-clusters.yaml
@@ -0,0 +1,80 @@
+---
+apiVersion: argoproj.io/v1alpha1
+kind: WorkflowTemplate
+metadata:
+  name: cleanup-infra-clusters
+  namespace: default
+spec:
+  templates:
+    # Stops all infra-managed Argo workflows on a target cluster before it
+    # is destroyed, giving each workflow's onExit handler time to clean up
+    # cloud resources.
+    - name: cleanup
+      inputs:
+        artifacts:
+          - name: kubeconfig
+            path: /tmp/kubeconfig
+            optional: true
+      activeDeadlineSeconds: 3600
+      script:
+        # NOTE(review): confirm this image provides bash; the script needs
+        # it in addition to the argo binary.
+        image: quay.io/argoproj/argocli:latest
+        command: [bash]
+        source: |
+          set -uo pipefail
+
+          if [ ! -f /tmp/kubeconfig ]; then
+            echo "No kubeconfig artifact available. Skipping cleanup."
+            exit 0
+          fi
+
+          export KUBECONFIG=/tmp/kubeconfig
+          export ARGO_NAMESPACE=default
+
+          # Prints the names of active infra-managed workflows. Returns
+          # non-zero when the listing itself fails, so an unreachable
+          # cluster is never mistaken for one with nothing to clean up.
+          get_active_workflows() {
+            local names
+            names=$(argo list --status Running,Pending -l infra.stackrox.com/cluster-id -o name) || return 1
+            # NOTE(review): argo may print this notice instead of an empty
+            # list; treat it as "no workflows". Confirm against the CLI.
+            if [ "$names" != "No workflows found" ]; then
+              printf '%s\n' "$names"
+            fi
+          }
+
+          if ! ACTIVE=$(get_active_workflows); then
+            echo "Failed to list workflows on the target cluster. Managed clusters may be orphaned."
+            exit 1
+          fi
+          if [ -z "$ACTIVE" ]; then
+            echo "No active workflows found. Skipping cleanup."
+            exit 0
+          fi
+
+          echo "Stopping all active workflows to trigger their destroy phases."
+          for wf in $ACTIVE; do
+            echo "Stopping workflow: ${wf}"
+            argo stop "$wf" || true
+          done
+
+          DEADLINE=$((SECONDS + 3000))
+          while [ $SECONDS -lt $DEADLINE ]; do
+            if ! REMAINING=$(get_active_workflows); then
+              # A transient API error must not end the wait as a success.
+              echo "Failed to list workflows. Retrying..."
+            elif [ -z "$REMAINING" ]; then
+              echo "All managed workflows completed."
+              exit 0
+            else
+              # Word-splits the list the same way as the stop loop above.
+              set -- $REMAINING
+              echo "Waiting for $# workflow(s) to complete..."
+            fi
+            sleep 30
+          done
+
+          echo "Timed out waiting for managed workflows. Proceeding with cluster destroy."
+          exit 1