From 9ac37f2918820b2d910722a999b0e4869b65a82d Mon Sep 17 00:00:00 2001 From: TerrifiedBug Date: Mon, 8 Jun 2026 14:36:13 +0100 Subject: [PATCH 1/2] feat(host-to-org): route verified custom domains to their org Resolve the org for a non-subdomain Host via a verified OrganizationDomainClaim (verifiedAt not null) whose domain equals the full hostname, falling back to DEFAULT_ORG_ID only when no verified claim matches. The existing .vectorflow.sh subdomain path and slug grammar are preserved and still take precedence. Add a short (30s) in-process TTL cache keyed by hostname (positive + negative) so custom-domain hosts don't pay a DB round-trip per request, with oldest-first eviction to bound memory against abusive Host headers. The custom-domain lookup is a DB read, consumed only by the Node auth layer (src/auth.ts per-org NextAuth/OIDC, SCIM auth). The edge middleware (src/proxy.ts) deliberately does not call this resolver and stays on the subdomain/auth-gate path, so no DB access is added at the edge. Boundary documented in code. No migration: @@index([domain]) and the partial unique index on verified rows already cover the equality probe. 
--- src/lib/__tests__/host-to-org.test.ts | 71 ++++++++++++++ src/lib/host-to-org.ts | 136 +++++++++++++++++++++++--- 2 files changed, 193 insertions(+), 14 deletions(-) diff --git a/src/lib/__tests__/host-to-org.test.ts b/src/lib/__tests__/host-to-org.test.ts index f2810242..2ada6483 100644 --- a/src/lib/__tests__/host-to-org.test.ts +++ b/src/lib/__tests__/host-to-org.test.ts @@ -12,6 +12,7 @@ import { extractSlugFromHost, normalizeHost, resolveOrgIdFromHost, + _resetHostOrgCacheForTests, } from "@/lib/host-to-org"; import { DEFAULT_ORG_ID } from "@/lib/org-constants"; @@ -19,6 +20,7 @@ const prismaMock = prisma as unknown as DeepMockProxy; beforeEach(() => { mockReset(prismaMock); + _resetHostOrgCacheForTests(); }); afterEach(() => { @@ -134,4 +136,73 @@ describe("resolveOrgIdFromHost", () => { "org-b-uuid", ); }); + + it("resolves a verified claim for a custom domain to its org", async () => { + // "logs" is a syntactically valid slug, so the resolver tries the + // subdomain path first (miss) before matching the verified claim. + prismaMock.organization.findUnique.mockResolvedValue(null); + prismaMock.organizationDomainClaim.findFirst.mockResolvedValue({ + organizationId: "org-acme-uuid", + } as never); + + await expect(resolveOrgIdFromHost("logs.acme.com")).resolves.toBe( + "org-acme-uuid", + ); + expect(prismaMock.organizationDomainClaim.findFirst).toHaveBeenCalledWith({ + where: { domain: "logs.acme.com", verifiedAt: { not: null } }, + select: { organizationId: true }, + }); + }); + + it("does NOT resolve an unverified claim (falls back to DEFAULT_ORG_ID)", async () => { + prismaMock.organization.findUnique.mockResolvedValue(null); + // Simulate the DB: a claim row exists for the domain but is unverified, so + // a query filtering `verifiedAt: { not: null }` returns nothing. Proves the + // resolver only ever honours verified claims. 
+ prismaMock.organizationDomainClaim.findFirst.mockImplementation(((args: { + where: { verifiedAt?: { not: null } }; + }) => + Promise.resolve( + args.where.verifiedAt?.not === null + ? null + : { organizationId: "org-acme-uuid" }, + )) as never); + + await expect(resolveOrgIdFromHost("logs.acme.com")).resolves.toBe( + DEFAULT_ORG_ID, + ); + expect(prismaMock.organizationDomainClaim.findFirst).toHaveBeenCalledWith( + expect.objectContaining({ + where: { domain: "logs.acme.com", verifiedAt: { not: null } }, + }), + ); + }); + + it("falls back to DEFAULT_ORG_ID for an unknown custom domain", async () => { + prismaMock.organization.findUnique.mockResolvedValue(null); + prismaMock.organizationDomainClaim.findFirst.mockResolvedValue(null); + + await expect(resolveOrgIdFromHost("unknown.example.com")).resolves.toBe( + DEFAULT_ORG_ID, + ); + }); + + it("caches the result so a repeated host does not hit the DB twice", async () => { + prismaMock.organization.findUnique.mockResolvedValue(null); + prismaMock.organizationDomainClaim.findFirst.mockResolvedValue({ + organizationId: "org-acme-uuid", + } as never); + + await expect(resolveOrgIdFromHost("logs.acme.com")).resolves.toBe( + "org-acme-uuid", + ); + await expect(resolveOrgIdFromHost("logs.acme.com")).resolves.toBe( + "org-acme-uuid", + ); + // Second call is served from the in-process TTL cache: no extra round-trip. + expect(prismaMock.organizationDomainClaim.findFirst).toHaveBeenCalledTimes( + 1, + ); + expect(prismaMock.organization.findUnique).toHaveBeenCalledTimes(1); + }); }); diff --git a/src/lib/host-to-org.ts b/src/lib/host-to-org.ts index f33135af..331095b1 100644 --- a/src/lib/host-to-org.ts +++ b/src/lib/host-to-org.ts @@ -13,6 +13,13 @@ * does NOT include an org-slug prefix. For those, this helper falls back * to `DEFAULT_ORG_ID` so OSS users see no behaviour change. * + * Custom domains (e.g. 
`logs.acme.com` CNAME'd at the platform) are routed + * via a *verified* `OrganizationDomainClaim` (DNS-TXT ownership). When the + * host is not a recognised `<slug>` subdomain, a verified claim whose + * `domain` equals the full host wins; otherwise we still fall back to + * `DEFAULT_ORG_ID`. That lookup is a DB read, so it is Node-runtime only + * (see the boundary note on `resolveOrgIdFromHost`). + * * The slug grammar is the same one used by enrollment tokens * (`isValidOrgSlug`): lowercase letters/digits/hyphens, 3–31 chars, * starts with a letter. This shared grammar is what makes @@ -58,29 +65,130 @@ export function extractSlugFromHost(normalizedHost: string): string | null { } /** - * Map a raw `Host:` header value to an organisation id. Returns - * `DEFAULT_ORG_ID` for OSS hosts, missing hosts, or slugs that don't - * exist in the DB. This intentionally fails open to the default org — - * cross-org leakage from the wrong direction (e.g. an attacker spoofing - * a slug) is prevented by RLS + per-org JWT secrets, not by this lookup. + * In-process TTL cache for host→org resolution. Both positive (a real org) + * and negative (`DEFAULT_ORG_ID`) results are cached so a custom-domain + * request does not pay a DB round-trip on every call. The TTL is deliberately + * short: it bounds how long a freshly-verified (or removed) + * `OrganizationDomainClaim` takes to start (or stop) routing, and it is the + * only staleness bound that holds across multiple server instances. Transient + * DB errors are NOT cached (we fail open but retry on the next request). */ -export async function resolveOrgIdFromHost( - host: string | null | undefined, +const HOST_ORG_CACHE_TTL_MS = 30_000; +/** + * Hard cap on cache entries; the oldest are evicted first to bound memory when + * unknown hosts (or abusive `Host:` headers) are probed. 
+ */ +const HOST_ORG_CACHE_MAX = 1024; + +const hostOrgCache = new Map(); + +function getCachedOrgId(host: string): string | undefined { + const hit = hostOrgCache.get(host); + if (!hit) return undefined; + if (hit.expiresAt <= Date.now()) { + hostOrgCache.delete(host); + return undefined; + } + return hit.orgId; +} + +function setCachedOrgId(host: string, orgId: string): void { + hostOrgCache.set(host, { + orgId, + expiresAt: Date.now() + HOST_ORG_CACHE_TTL_MS, + }); + // Map preserves insertion order, so the first key is the oldest. + while (hostOrgCache.size > HOST_ORG_CACHE_MAX) { + const oldest = hostOrgCache.keys().next().value; + if (oldest === undefined) break; + hostOrgCache.delete(oldest); + } +} + +/** + * Clear the host→org cache. Test-only: unit tests reset it between cases so a + * cached entry from one case cannot mask the DB mock in the next. + */ +export function _resetHostOrgCacheForTests(): void { + hostOrgCache.clear(); +} + +/** + * Resolve an org id from an already normalised + lowercased host, hitting the + * DB. Two paths, in order: + * + * 1. Subdomain path (the hot path): if the first label is a syntactically + * valid slug, look up `Organization.slug`. Preserves the + * `.vectorflow.sh` wildcard scheme exactly. + * 2. Custom-domain path: when no slug matches, look up a *verified* + * `OrganizationDomainClaim` (`verifiedAt` not null) whose `domain` + * equals the full host (e.g. `logs.acme.com`). DNS-TXT ownership is + * proof enough to route the host to that org. + * + * No match in either path → `DEFAULT_ORG_ID`. Throws propagate so the caller + * can fail open without caching a transient miss. 
+ */ +async function resolveOrgIdFromHostUncached( + normalizedHost: string, ): Promise { - if (!host) return DEFAULT_ORG_ID; - const slug = extractSlugFromHost(normalizeHost(host)); - if (!slug) return DEFAULT_ORG_ID; - try { + const slug = extractSlugFromHost(normalizedHost); + if (slug) { // Subdomain→org resolution runs before any tenancy scope and reads the // (fenced) Organization table by slug, so it uses the admin connection. const org = await adminPrisma.organization.findUnique({ where: { slug }, select: { id: true }, }); - return org?.id ?? DEFAULT_ORG_ID; + if (org) return org.id; + } + // Custom domains always carry at least one dot; skip the probe for + // single-label hosts (`localhost`) and bare IPs that can never own a claim. + if (normalizedHost.includes(".")) { + // `OrganizationDomainClaim.domain` is stored lowercase + punycode and + // `Host:` values already arrive punycoded, so a lowercase compare matches. + // `@@index([domain])` (and the partial unique index on verified rows) + // make this an indexed equality probe. Admin connection: runs + // pre-tenancy-scope, same as the slug lookup above. + const claim = await adminPrisma.organizationDomainClaim.findFirst({ + where: { domain: normalizedHost, verifiedAt: { not: null } }, + select: { organizationId: true }, + }); + if (claim) return claim.organizationId; + } + return DEFAULT_ORG_ID; +} + +/** + * Map a raw `Host:` header value to an organisation id. Returns + * `DEFAULT_ORG_ID` for OSS hosts, missing hosts, slugs that don't exist, and + * custom domains with no verified claim. This intentionally fails open to the + * default org — cross-org leakage from the wrong direction (e.g. an attacker + * spoofing a slug or `Host:` header) is prevented by RLS + per-org JWT + * secrets, not by this lookup. + * + * Runtime boundary: this performs a DB read, so it MUST only be called from + * the Node runtime. 
It is consumed by `src/auth.ts` (per-org NextAuth + * instance + OIDC) and the SCIM auth layer, both Node. The edge middleware + * `src/proxy.ts` deliberately does NOT call this — it stays on the + * auth-gate/CSP path and never touches the DB — so custom domains are + * resolved in the Node auth layer, never at the edge. + */ +export async function resolveOrgIdFromHost( + host: string | null | undefined, +): Promise { + if (!host) return DEFAULT_ORG_ID; + const normalizedHost = normalizeHost(host).toLowerCase(); + if (!normalizedHost) return DEFAULT_ORG_ID; + const cached = getCachedOrgId(normalizedHost); + if (cached !== undefined) return cached; + try { + const orgId = await resolveOrgIdFromHostUncached(normalizedHost); + setCachedOrgId(normalizedHost, orgId); + return orgId; } catch { - // DB not reachable (build phase, migration in progress, etc.) — - // OSS behaviour preserved. + // DB not reachable (build phase, migration in progress, etc.) — fail open + // to OSS behaviour. Do NOT cache: a transient miss must not pin the host + // to the default org for the full TTL. return DEFAULT_ORG_ID; } } From 1be40d00fdcd23f2ba282ab6a42d89f3fa54063d Mon Sep 17 00:00:00 2001 From: TerrifiedBug Date: Mon, 8 Jun 2026 14:59:35 +0100 Subject: [PATCH 2/2] fix(host-to-org): verified claim wins over colliding slug (reviewer CL-3) The slug path ran on any multi-label host's first label, so a custom domain (e.g. logs.acme.com) whose first label collides with an existing org slug (logs) was misrouted to the slug-org, shadowing the verified claim (confused-deputy on auth/OIDC/SCIM config). Reorder to claim-first: a DNS-verified full-host OrganizationDomainClaim wins over the slug-prefix match; a claim can't exist for *.vectorflow.sh (no tenant DNS-TXT), so genuine subdomains are unaffected. Cache covers the extra indexed probe on the subdomain hot path. +shadowing test. 
--- src/lib/__tests__/host-to-org.test.ts | 22 ++++++++++-- src/lib/host-to-org.ts | 51 +++++++++++++++------------ 2 files changed, 48 insertions(+), 25 deletions(-) diff --git a/src/lib/__tests__/host-to-org.test.ts b/src/lib/__tests__/host-to-org.test.ts index 2ada6483..a0e09117 100644 --- a/src/lib/__tests__/host-to-org.test.ts +++ b/src/lib/__tests__/host-to-org.test.ts @@ -138,8 +138,7 @@ describe("resolveOrgIdFromHost", () => { }); it("resolves a verified claim for a custom domain to its org", async () => { - // "logs" is a syntactically valid slug, so the resolver tries the - // subdomain path first (miss) before matching the verified claim. + // Claim-first: a verified full-host claim wins regardless of the slug path. prismaMock.organization.findUnique.mockResolvedValue(null); prismaMock.organizationDomainClaim.findFirst.mockResolvedValue({ organizationId: "org-acme-uuid", @@ -154,6 +153,23 @@ describe("resolveOrgIdFromHost", () => { }); }); + it("a verified claim wins over a colliding org slug (no shadowing)", async () => { + // org B owns slug "logs"; org A has a verified claim on logs.acme.com. + // The DNS-proven full-host claim MUST win over the slug-prefix match. + prismaMock.organization.findUnique.mockResolvedValue({ + id: "org-B-slug-logs", + } as never); + prismaMock.organizationDomainClaim.findFirst.mockResolvedValue({ + organizationId: "org-A-claim", + } as never); + + await expect(resolveOrgIdFromHost("logs.acme.com")).resolves.toBe( + "org-A-claim", + ); + // Claim-first short-circuits, so the colliding slug is never consulted. 
+ expect(prismaMock.organization.findUnique).not.toHaveBeenCalled(); + }); + it("does NOT resolve an unverified claim (falls back to DEFAULT_ORG_ID)", async () => { prismaMock.organization.findUnique.mockResolvedValue(null); // Simulate the DB: a claim row exists for the domain but is unverified, so @@ -203,6 +219,6 @@ describe("resolveOrgIdFromHost", () => { expect(prismaMock.organizationDomainClaim.findFirst).toHaveBeenCalledTimes( 1, ); - expect(prismaMock.organization.findUnique).toHaveBeenCalledTimes(1); + expect(prismaMock.organization.findUnique).not.toHaveBeenCalled(); }); }); diff --git a/src/lib/host-to-org.ts b/src/lib/host-to-org.ts index 331095b1..5f48f199 100644 --- a/src/lib/host-to-org.ts +++ b/src/lib/host-to-org.ts @@ -115,46 +115,53 @@ export function _resetHostOrgCacheForTests(): void { /** * Resolve an org id from an already normalised + lowercased host, hitting the - * DB. Two paths, in order: + * DB. Two paths, in PRECEDENCE order: * - * 1. Subdomain path (the hot path): if the first label is a syntactically + * 1. Custom-domain path: a *verified* `OrganizationDomainClaim` + * (`verifiedAt` not null) whose `domain` equals the FULL host (e.g. + * `logs.acme.com`). DNS-TXT ownership of the whole host is the strongest + * signal, so it MUST win over the slug path — otherwise a custom domain + * whose first label happens to collide with an existing org slug (e.g. + * `logs.acme.com` vs an org with slug `logs`) would misroute to the + * slug-org. A claim can only exist for a real custom domain (a tenant + * cannot set DNS-TXT on `*.vectorflow.sh`), so this never shadows a + * genuine `.vectorflow.sh` subdomain. + * 2. Subdomain path (the common case): if the first label is a syntactically * valid slug, look up `Organization.slug`. Preserves the * `.vectorflow.sh` wildcard scheme exactly. - * 2. 
Custom-domain path: when no slug matches, look up a *verified* - * `OrganizationDomainClaim` (`verifiedAt` not null) whose `domain` - * equals the full host (e.g. `logs.acme.com`). DNS-TXT ownership is - * proof enough to route the host to that org. * * No match in either path → `DEFAULT_ORG_ID`. Throws propagate so the caller - * can fail open without caching a transient miss. + * can fail open without caching a transient miss. The claim probe is an indexed + * point query and the result is TTL-cached, so the common subdomain path pays + * it only on a cold cache. */ async function resolveOrgIdFromHostUncached( normalizedHost: string, ): Promise { - const slug = extractSlugFromHost(normalizedHost); - if (slug) { - // Subdomain→org resolution runs before any tenancy scope and reads the - // (fenced) Organization table by slug, so it uses the admin connection. - const org = await adminPrisma.organization.findUnique({ - where: { slug }, - select: { id: true }, - }); - if (org) return org.id; - } - // Custom domains always carry at least one dot; skip the probe for - // single-label hosts (`localhost`) and bare IPs that can never own a claim. + // 1. Verified custom-domain claim on the full host wins (DNS-TXT ownership). + // Custom domains always carry at least one dot; skip the probe for + // single-label hosts (`localhost`) and bare IPs that can never own a claim. if (normalizedHost.includes(".")) { // `OrganizationDomainClaim.domain` is stored lowercase + punycode and // `Host:` values already arrive punycoded, so a lowercase compare matches. - // `@@index([domain])` (and the partial unique index on verified rows) - // make this an indexed equality probe. Admin connection: runs - // pre-tenancy-scope, same as the slug lookup above. + // `@@index([domain])` (+ the partial unique index on verified rows) make + // this an indexed equality probe. Admin connection: runs pre-tenancy-scope. 
const claim = await adminPrisma.organizationDomainClaim.findFirst({ where: { domain: normalizedHost, verifiedAt: { not: null } }, select: { organizationId: true }, }); if (claim) return claim.organizationId; } + // 2. Subdomain→org by slug (the `.vectorflow.sh` hot path). Uses the + // admin connection (reads the fenced Organization table pre-tenancy-scope). + const slug = extractSlugFromHost(normalizedHost); + if (slug) { + const org = await adminPrisma.organization.findUnique({ + where: { slug }, + select: { id: true }, + }); + if (org) return org.id; + } return DEFAULT_ORG_ID; }