diff --git a/.github/workflows/PR.yaml b/.github/workflows/PR.yaml
index f47e0a102..b5ca39ea8 100644
--- a/.github/workflows/PR.yaml
+++ b/.github/workflows/PR.yaml
@@ -34,7 +34,7 @@ jobs:
         with:
           flavor: gke-default
           name: infra-pr-${{ github.event.pull_request.number }}
-          args: machine-type=e2-standard-4,nodes=3,gcp-image-type=ubuntu_containerd
+          args: machine-type=e2-standard-4,nodes=3,gcp-image-type=ubuntu_containerd,has-infra-server=true
           lifespan: ${{ github.actor == 'dependabot[bot]' && '1h' || '24h' }}
           wait: true
           token: ${{ secrets.INFRA_TOKEN }}
diff --git a/Makefile b/Makefile
index 3e948eb93..faedb1397 100644
--- a/Makefile
+++ b/Makefile
@@ -175,7 +175,7 @@ push:
 
 .PHONY: argo-workflow-lint
 argo-workflow-lint:
-	@argo lint ./chart/infra-server/static/workflow*.yaml
+	@argo lint --offline ./chart/infra-server/static/workflow*.yaml ./chart/infra-server/templates/workflowtemplates/*.yaml
 
 .PHONY: shellcheck
 shellcheck:
diff --git a/chart/infra-server/static/flavors.yaml b/chart/infra-server/static/flavors.yaml
index a73895f8c..9e209fc67 100644
--- a/chart/infra-server/static/flavors.yaml
+++ b/chart/infra-server/static/flavors.yaml
@@ -194,6 +194,11 @@
       value: false
       kind: optional
 
+    - name: has-infra-server
+      description: Whether this cluster runs an infra-server instance. Enables pre-destroy cleanup of managed clusters.
+      value: false
+      kind: optional
+
   artifacts:
     - name: kubeconfig
       description: Kube config for connecting to this cluster
diff --git a/chart/infra-server/static/workflow-gke-default.yaml b/chart/infra-server/static/workflow-gke-default.yaml
index 4be3a1203..816fd0f0a 100644
--- a/chart/infra-server/static/workflow-gke-default.yaml
+++ b/chart/infra-server/static/workflow-gke-default.yaml
@@ -2,6 +2,7 @@ apiVersion: argoproj.io/v1alpha1
 kind: Workflow
 metadata:
   generateName: gke-default-
+  namespace: default
 spec:
   entrypoint: start
   onExit: stop
@@ -20,6 +21,8 @@ spec:
         value: ""
       - name: set-ssd-storage-default
         value: ""
+      - name: has-infra-server
+        value: "false"
 
   volumes:
     - name: credentials
@@ -36,6 +39,23 @@ spec:
 
     - name: stop
       steps:
+        # Stops all Argo workflows on a target cluster before it is destroyed,
+        # giving each workflow's onExit handler time to clean up cloud resources.
+        # Only relevant for GKE clusters that run the infra server.
+        - - name: cleanup-infra-clusters
+            templateRef:
+              name: cleanup-infra-clusters
+              template: cleanup
+            arguments:
+              artifacts:
+                - name: kubeconfig
+                  from: '{{ "{{" }}workflow.outputs.artifacts.global-kubeconfig{{ "}}" }}'
+                  optional: true
+            when: '{{ "{{" }}workflow.parameters.has-infra-server{{ "}}" }} == true'
+            continueOn:
+              failed: true
+              error: true
+
         - - name: destroy
             template: destroy
             arguments:
@@ -48,8 +68,10 @@ spec:
       outputs:
         artifacts:
           - name: kubeconfig
+            globalName: global-kubeconfig
             path: /outputs/kubeconfig
             mode: 0644
+            optional: true
             archive:
               none: {}
           - name: connect
diff --git a/chart/infra-server/templates/workflowtemplates/cleanup-infra-clusters.yaml b/chart/infra-server/templates/workflowtemplates/cleanup-infra-clusters.yaml
new file mode 100644
index 000000000..97f598428
--- /dev/null
+++ b/chart/infra-server/templates/workflowtemplates/cleanup-infra-clusters.yaml
@@ -0,0 +1,88 @@
+---
+apiVersion: argoproj.io/v1alpha1
+kind: WorkflowTemplate
+metadata:
+  name: cleanup-infra-clusters
+  namespace: default
+spec:
+  templates:
+    # Stops all infra-managed Argo workflows on a target cluster before it
+    # is destroyed, giving each workflow's onExit handler time to clean up
+    # cloud resources.
+    #
+    # Input: optional "kubeconfig" artifact for the target cluster. When it
+    # is absent the script exits 0 without doing anything.
+    # Exit status: 0 when nothing is left running, 1 when workflows could
+    # not be listed or were still active after the wait window.
+    - name: cleanup
+      inputs:
+        artifacts:
+          - name: kubeconfig
+            path: /tmp/kubeconfig
+            optional: true
+      activeDeadlineSeconds: 3600
+      script:
+        # NOTE(review): this script needs bash, wc and tr inside the image,
+        # and ":latest" is a floating tag. Confirm the argocli image ships a
+        # shell and consider pinning a version.
+        image: quay.io/argoproj/argocli:latest
+        command: [bash]
+        source: |
+          set -uo pipefail
+
+          if [ ! -f /tmp/kubeconfig ]; then
+            echo "No kubeconfig artifact available. Skipping cleanup."
+            exit 0
+          fi
+
+          export KUBECONFIG=/tmp/kubeconfig
+          export ARGO_NAMESPACE=default
+
+          # Prints the names of Running/Pending workflows that carry the
+          # infra cluster-id label. Returns non-zero when the listing itself
+          # fails, so callers can tell "none left" apart from "could not ask".
+          get_active_workflows() {
+            local names
+            names=$(argo list --status Running,Pending -l infra.stackrox.com/cluster-id -o name) || return 1
+            # NOTE(review): some argo CLI versions appear to print this
+            # placeholder even with "-o name"; treat it as an empty result.
+            if [ "$names" != "No workflows found" ]; then
+              printf '%s' "$names"
+            fi
+          }
+
+          if ! ACTIVE=$(get_active_workflows); then
+            echo "Failed to list workflows on the target cluster." >&2
+            exit 1
+          fi
+          if [ -z "$ACTIVE" ]; then
+            echo "No active workflows found. Skipping cleanup."
+            exit 0
+          fi
+
+          echo "Stopping all active workflows to trigger their destroy phases."
+          for wf in $ACTIVE; do
+            echo "Stopping workflow: ${wf}"
+            # Best effort: one workflow failing to stop must not block the rest.
+            argo stop "$wf" || echo "Failed to stop workflow: ${wf}" >&2
+          done
+
+          # Poll for up to 3000s, leaving headroom below activeDeadlineSeconds.
+          DEADLINE=$((SECONDS + 3000))
+          while [ $SECONDS -lt $DEADLINE ]; do
+            if REMAINING=$(get_active_workflows); then
+              if [ -z "$REMAINING" ]; then
+                echo "All managed workflows completed."
+                exit 0
+              fi
+              COUNT=$(echo "$REMAINING" | wc -l | tr -d ' ')
+              echo "Waiting for ${COUNT} workflow(s) to complete..."
+            else
+              # A failed listing is not proof of completion; keep polling.
+              echo "Failed to list workflows; retrying." >&2
+            fi
+            sleep 30
+          done
+
+          echo "Timed out waiting for managed workflows. Proceeding with cluster destroy."
+          exit 1