Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions src/lib/env.ts
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,12 @@ const runtimeEnvSchema = z
METRICS_COMPRESS_AFTER: z.string().default("24 hours"),
PORT: z.coerce.number().int().default(3000),

// SLO gate for canary broadening (IF-7): the aggregate canary error ratio
// (0..1) above which `broadenRollout` refuses to advance the canary and
// holds it in HEALTH_CHECK with a recorded reason. Conservative — only a
// clearly-burned budget blocks; this never triggers an auto-rollback.
VF_ROLLOUT_ERROR_BUDGET: z.coerce.number().min(0).max(1).default(0.05),

VF_DISABLE_LOCAL_AUTH: z.string().optional(),
DEV_AUTH_BYPASS: z.string().optional(),
DEV_AUTH_BYPASS_ALLOW_NETWORK: z.string().optional(),
Expand Down
64 changes: 64 additions & 0 deletions src/server/services/__tests__/staged-rollout.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -380,6 +380,70 @@ describe("StagedRolloutService", () => {
"Staged rollout not found",
);
});

it("blocks broaden when the canary error budget is burned (records reason, status not advanced)", async () => {
  const rolloutId = "rollout-1";

  // Arrange: a canary parked in HEALTH_CHECK whose pipeline status rows show
  // 100 errors over 1000 events (10%), which is over the default 0.05 budget.
  const heldRollout = makeRollout({ status: "HEALTH_CHECK" });
  const overBudgetRows = [{ eventsIn: BigInt(1000), errorsTotal: BigInt(100) }];
  prismaMock.release.findFirst.mockResolvedValue(heldRollout as never);
  prismaMock.nodePipelineStatus.findMany.mockResolvedValue(
    overBudgetRows as never,
  );
  prismaMock.release.update.mockResolvedValue({} as never);

  // Act + assert: the broaden attempt must reject with an error-budget message.
  const attempt = service.broadenRollout(rolloutId);
  await expect(attempt).rejects.toThrow(/error budget/i);

  // Nothing may have been pushed to the remaining nodes.
  expect(relayPushMock).not.toHaveBeenCalled();

  // The hold reason is persisted on the rollout as a review note ...
  const reasonOnlyUpdate = {
    where: { id: rolloutId, strategy: "CANARY" },
    data: { reviewNote: expect.stringContaining("error budget") },
  };
  expect(prismaMock.release.update).toHaveBeenCalledWith(reasonOnlyUpdate);

  // ... and no update call may have moved the status to BROADENED.
  const anyBroadenedUpdate = expect.objectContaining({
    data: expect.objectContaining({ status: "BROADENED" }),
  });
  expect(prismaMock.release.update).not.toHaveBeenCalledWith(
    anyBroadenedUpdate,
  );

  // The "canary_broadened" SSE event must not have been broadcast.
  const broadenedEvent = expect.objectContaining({ action: "canary_broadened" });
  expect(broadcastMock).not.toHaveBeenCalledWith(
    broadenedEvent,
    expect.anything(),
  );
});

it("proceeds with broaden when the canary error ratio is within budget", async () => {
  const rolloutId = "rollout-1";

  // Arrange: a canary in HEALTH_CHECK whose pipeline status rows show
  // 10 errors over 1000 events (1%), which is inside the default 0.05 budget.
  const heldRollout = makeRollout({ status: "HEALTH_CHECK" });
  const withinBudgetRows = [{ eventsIn: BigInt(1000), errorsTotal: BigInt(10) }];
  prismaMock.release.findFirst.mockResolvedValue(heldRollout as never);
  prismaMock.nodePipelineStatus.findMany.mockResolvedValue(
    withinBudgetRows as never,
  );
  prismaMock.release.update.mockResolvedValue({} as never);
  fireEventAlertMock.mockResolvedValue(undefined as never);

  // Act: the gate should let the broaden through without throwing.
  await service.broadenRollout(rolloutId);

  // One config push per remaining node; this test expects exactly three.
  expect(relayPushMock).toHaveBeenCalledTimes(3);

  // The rollout is persisted as BROADENED with a broadenedAt timestamp.
  const broadenedUpdate = {
    where: { id: rolloutId, strategy: "CANARY" },
    data: { status: "BROADENED", broadenedAt: expect.any(Date) },
  };
  expect(prismaMock.release.update).toHaveBeenCalledWith(broadenedUpdate);

  // The "canary_broadened" SSE event is broadcast for pipe-1 with "env-1".
  const broadenedEvent = expect.objectContaining({
    action: "canary_broadened",
    pipelineId: "pipe-1",
  });
  expect(broadcastMock).toHaveBeenCalledWith(broadenedEvent, "env-1");
});
});

// ─── rollbackRollout ────────────────────────────────────────────────
Expand Down
45 changes: 44 additions & 1 deletion src/server/services/staged-rollout.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,13 @@ import { relayPush } from "@/server/services/push-broadcast";
import { generateVectorYaml } from "@/lib/config-generator";
import { decryptNodeConfig } from "@/server/services/config-crypto";
import { parseDeploymentStrategy } from "@/lib/deployment-strategy";
import { env } from "@/lib/env";
import {
getAggregateErrorRate,
getRecentMeanLatency,
} from "@/server/services/auto-rollback";
import { TRPCError } from "@trpc/server";
import { infoLog, errorLog } from "@/lib/logger";
import { infoLog, warnLog, errorLog } from "@/lib/logger";

// ─── Constants ──────────────────────────────────────────────────────────────

Expand Down Expand Up @@ -411,6 +412,28 @@ export class StagedRolloutService {
}
}

/**
 * Error-budget check used to gate broadening of a canary.
 *
 * Reads the configured budget from `env.VF_ROLLOUT_ERROR_BUDGET` and the
 * pipeline's current aggregate error rate from `getAggregateErrorRate`. The
 * rate is divided by 100 so it can be compared against the budget as a ratio.
 *
 * The check is deliberately conservative: if no rate is available
 * (`getAggregateErrorRate` resolves to `null`) the budget counts as NOT
 * burned, so a missing signal never blocks a broaden. This method only
 * reports; it performs no rollback and writes nothing.
 *
 * @param pipelineId - Pipeline whose aggregate error rate is evaluated.
 * @returns `burned` is true only when a rate was observed and it is strictly
 *   greater than `budget`; `errorRatio` is the observed rate / 100, or `null`
 *   when no rate was available; `budget` is the configured threshold.
 */
private async evaluateErrorBudget(
  pipelineId: string,
): Promise<{ burned: boolean; errorRatio: number | null; budget: number }> {
  const budget = env.VF_ROLLOUT_ERROR_BUDGET;
  const observedPercent = await getAggregateErrorRate(pipelineId);

  // No data yet -> keep the ratio null and never report the budget as burned.
  const errorRatio = observedPercent === null ? null : observedPercent / 100;
  const burned = errorRatio !== null && errorRatio > budget;

  return { burned, errorRatio, budget };
}

/**
* Broaden a canary rollout to all remaining nodes.
* Only allowed when status is HEALTH_CHECK (health-check window has expired).
Expand All @@ -437,6 +460,26 @@ export class StagedRolloutService {
});
}

// SLO gate (IF-7): refuse to broaden a canary whose error budget is
// clearly burned. Reuses the same error-rate signal as the health-check;
// it only holds the canary in HEALTH_CHECK and records the reason — it
// never auto-rolls back (out of scope).
const budgetCheck = await this.evaluateErrorBudget(rollout.pipelineId);
if (budgetCheck.burned) {
const errorPct = ((budgetCheck.errorRatio ?? 0) * 100).toFixed(2);
const budgetPct = (budgetCheck.budget * 100).toFixed(2);
const reason = `Broaden held — canary error ratio ${errorPct}% exceeds error budget ${budgetPct}%`;
await prisma.release.update({
where: { id: rolloutId, strategy: "CANARY" },
data: { reviewNote: reason },
});
warnLog(
"staged-rollout",
`Blocked broaden of rollout ${rolloutId}: error ratio ${errorPct}% > budget ${budgetPct}%`,
);
throw new TRPCError({ code: "PRECONDITION_FAILED", message: reason });
}

// Send config_changed push to remaining nodes
const remainingNodeIds = (rollout.remainingNodeIds as string[]) ?? [];
for (const nodeId of remainingNodeIds) {
Expand Down
Loading