diff --git a/src/lib/env.ts b/src/lib/env.ts
index cbd2ebee..3f293c69 100644
--- a/src/lib/env.ts
+++ b/src/lib/env.ts
@@ -63,6 +63,16 @@ const runtimeEnvSchema = z
     METRICS_COMPRESS_AFTER: z.string().default("24 hours"),
     PORT: z.coerce.number().int().default(3000),
 
+    // SLO gate for canary broadening (IF-7): the aggregate canary error ratio
+    // (0..1) above which `broadenRollout` refuses to advance the canary and
+    // holds it in HEALTH_CHECK with a recorded reason. Conservative — only a
+    // clearly-burned budget blocks; this never triggers an auto-rollback.
+    // NOTE: `z.coerce.number()` converts with `Number()`, so a blank value
+    // (`VF_ROLLOUT_ERROR_BUDGET=`) parses as 0 — the strictest budget — rather
+    // than falling back to the 0.05 default (assuming blanks are not stripped
+    // before this schema runs — TODO confirm).
+    VF_ROLLOUT_ERROR_BUDGET: z.coerce.number().min(0).max(1).default(0.05),
+
     VF_DISABLE_LOCAL_AUTH: z.string().optional(),
     DEV_AUTH_BYPASS: z.string().optional(),
     DEV_AUTH_BYPASS_ALLOW_NETWORK: z.string().optional(),
diff --git a/src/server/services/__tests__/staged-rollout.test.ts b/src/server/services/__tests__/staged-rollout.test.ts
index 1c624800..09f1057c 100644
--- a/src/server/services/__tests__/staged-rollout.test.ts
+++ b/src/server/services/__tests__/staged-rollout.test.ts
@@ -380,6 +380,70 @@ describe("StagedRolloutService", () => {
         "Staged rollout not found",
       );
     });
+
+    it("blocks broaden when the canary error budget is burned (records reason, status not advanced)", async () => {
+      prismaMock.release.findFirst.mockResolvedValue(
+        makeRollout({ status: "HEALTH_CHECK" }) as never,
+      );
+      // 10% error ratio (100 / 1000) — above the default 0.05 budget.
+      prismaMock.nodePipelineStatus.findMany.mockResolvedValue([
+        { eventsIn: BigInt(1000), errorsTotal: BigInt(100) },
+      ] as never);
+      prismaMock.release.update.mockResolvedValue({} as never);
+
+      await expect(service.broadenRollout("rollout-1")).rejects.toThrow(
+        /error budget/i,
+      );
+
+      // No broaden push to the remaining nodes.
+      expect(relayPushMock).not.toHaveBeenCalled();
+
+      // Reason recorded on the rollout; status NOT advanced to BROADENED.
+      expect(prismaMock.release.update).toHaveBeenCalledWith({
+        where: { id: "rollout-1", strategy: "CANARY" },
+        data: { reviewNote: expect.stringContaining("error budget") },
+      });
+      expect(prismaMock.release.update).not.toHaveBeenCalledWith(
+        expect.objectContaining({
+          data: expect.objectContaining({ status: "BROADENED" }),
+        }),
+      );
+
+      // No "broadened" SSE event fired.
+      expect(broadcastMock).not.toHaveBeenCalledWith(
+        expect.objectContaining({ action: "canary_broadened" }),
+        expect.anything(),
+      );
+    });
+
+    it("proceeds with broaden when the canary error ratio is within budget", async () => {
+      prismaMock.release.findFirst.mockResolvedValue(
+        makeRollout({ status: "HEALTH_CHECK" }) as never,
+      );
+      // 1% error ratio (10 / 1000) — within the default 0.05 budget.
+      prismaMock.nodePipelineStatus.findMany.mockResolvedValue([
+        { eventsIn: BigInt(1000), errorsTotal: BigInt(10) },
+      ] as never);
+      prismaMock.release.update.mockResolvedValue({} as never);
+      fireEventAlertMock.mockResolvedValue(undefined as never);
+
+      await service.broadenRollout("rollout-1");
+
+      // Broaden push to all 3 remaining nodes.
+      expect(relayPushMock).toHaveBeenCalledTimes(3);
+
+      // Status advanced to BROADENED.
+      expect(prismaMock.release.update).toHaveBeenCalledWith({
+        where: { id: "rollout-1", strategy: "CANARY" },
+        data: { status: "BROADENED", broadenedAt: expect.any(Date) },
+      });
+
+      // "broadened" SSE event fired.
+      expect(broadcastMock).toHaveBeenCalledWith(
+        expect.objectContaining({ action: "canary_broadened", pipelineId: "pipe-1" }),
+        "env-1",
+      );
+    });
   });
 
   // ─── rollbackRollout ────────────────────────────────────────────────
diff --git a/src/server/services/staged-rollout.ts b/src/server/services/staged-rollout.ts
index d8cf4d28..61ce3fb7 100644
--- a/src/server/services/staged-rollout.ts
+++ b/src/server/services/staged-rollout.ts
@@ -8,12 +8,13 @@ import { relayPush } from "@/server/services/push-broadcast";
 import { generateVectorYaml } from "@/lib/config-generator";
 import { decryptNodeConfig } from "@/server/services/config-crypto";
 import { parseDeploymentStrategy } from "@/lib/deployment-strategy";
+import { env } from "@/lib/env";
 import {
   getAggregateErrorRate,
   getRecentMeanLatency,
 } from "@/server/services/auto-rollback";
 import { TRPCError } from "@trpc/server";
-import { infoLog, errorLog } from "@/lib/logger";
+import { infoLog, warnLog, errorLog } from "@/lib/logger";
 
 // ─── Constants ──────────────────────────────────────────────────────────────
 
@@ -411,6 +412,44 @@ export class StagedRolloutService {
     }
   }
 
+  /**
+   * SLO error-budget gate for broadening. Computes the canary's aggregate
+   * error ratio from the SAME signal the health-check uses
+   * (`getAggregateErrorRate`, NodePipelineStatus rows) and reports whether the
+   * configured budget (`VF_ROLLOUT_ERROR_BUDGET`, a 0..1 ratio) is clearly
+   * burned. Conservative: when no metric data exists yet (`errorRate === null`)
+   * the budget is treated as NOT burned, so broadening is never blocked on an
+   * absent signal — only on an observed, over-budget error ratio. Never
+   * triggers a rollback (out of scope); callers only hold the canary.
+   *
+   * The result is a discriminated union on `burned`: a burned verdict always
+   * carries the observed, non-null `errorRatio`, so callers can report it
+   * without inventing a fallback value.
+   *
+   * @param pipelineId - Pipeline whose canary error rate is evaluated.
+   * @returns The verdict, the observed error ratio (`null` when no metric
+   *   data is available) and the budget it was compared against.
+   */
+  private async evaluateErrorBudget(
+    pipelineId: string,
+  ): Promise<
+    | { burned: false; errorRatio: number | null; budget: number }
+    | { burned: true; errorRatio: number; budget: number }
+  > {
+    const budget = env.VF_ROLLOUT_ERROR_BUDGET;
+    // The helper's result is consumed as a percentage; the budget is a ratio.
+    const errorRatePercent = await getAggregateErrorRate(pipelineId);
+    if (errorRatePercent === null) {
+      return { burned: false, errorRatio: null, budget };
+    }
+    const errorRatio = errorRatePercent / 100;
+    // Strictly greater: a ratio exactly at the budget is still within it.
+    if (errorRatio > budget) {
+      return { burned: true, errorRatio, budget };
+    }
+    return { burned: false, errorRatio, budget };
+  }
+
   /**
    * Broaden a canary rollout to all remaining nodes.
    * Only allowed when status is HEALTH_CHECK (health-check window has expired).
@@ -437,6 +476,27 @@ export class StagedRolloutService {
       });
     }
 
+    // SLO gate (IF-7): refuse to broaden a canary whose error budget is
+    // clearly burned. Reuses the same error-rate signal as the health-check;
+    // it only holds the canary in HEALTH_CHECK and records the reason — it
+    // never auto-rolls back (out of scope).
+    const budgetCheck = await this.evaluateErrorBudget(rollout.pipelineId);
+    if (budgetCheck.burned) {
+      const errorPct = (budgetCheck.errorRatio * 100).toFixed(2);
+      const budgetPct = (budgetCheck.budget * 100).toFixed(2);
+      const reason = `Broaden held — canary error ratio ${errorPct}% exceeds error budget ${budgetPct}%`;
+      // Log before persisting so the hold is visible even if the note write fails.
+      warnLog(
+        "staged-rollout",
+        `Blocked broaden of rollout ${rolloutId}: error ratio ${errorPct}% > budget ${budgetPct}%`,
+      );
+      await prisma.release.update({
+        where: { id: rolloutId, strategy: "CANARY" },
+        data: { reviewNote: reason },
+      });
+      throw new TRPCError({ code: "PRECONDITION_FAILED", message: reason });
+    }
+
     // Send config_changed push to remaining nodes
     const remainingNodeIds = (rollout.remainingNodeIds as string[]) ?? [];
     for (const nodeId of remainingNodeIds) {