Skip to content

Commit 9b9e590

Browse files
authored
feat: health monitoring and visualization (#53)
Closes #43. Command to run the tests: `deno test --allow-net --allow-env src/backend/tests/container-health.test.ts`
2 parents 11e6726 + a0c516e commit 9b9e590

14 files changed

Lines changed: 2284 additions & 20 deletions

docker/dev.docker-compose.yml

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ services:
1717
- "${PORT_BACKEND}:7000"
1818
env_file:
1919
- ../src/backend/.env
20+
depends_on:
21+
- prometheus
2022
vue:
2123
container_name: df_frontend
2224
image: df_frontend
@@ -27,3 +29,39 @@ services:
2729
target: base
2830
ports:
2931
- "${PORT_FRONTEND}:8000"
32+
33+
cadvisor:
34+
image: gcr.io/cadvisor/cadvisor:v0.47.0
35+
container_name: df_cadvisor
36+
restart: unless-stopped
37+
mem_limit: 128m
38+
privileged: true
39+
volumes:
40+
- /:/rootfs:ro
41+
- /var/run:/var/run:ro
42+
- /sys:/sys:ro
43+
- /var/lib/docker/:/var/lib/docker:ro
44+
ports:
45+
- "8081:8080"
46+
47+
prometheus:
48+
image: prom/prometheus:v2.48.0
49+
container_name: df_prometheus
50+
restart: unless-stopped
51+
mem_limit: 384m
52+
command:
53+
- '--config.file=/etc/prometheus/prometheus.yml'
54+
- '--storage.tsdb.path=/prometheus'
55+
- '--storage.tsdb.retention.time=24h'
56+
- '--storage.tsdb.retention.size=256MB'
57+
- '--web.enable-lifecycle'
58+
volumes:
59+
- ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
60+
- prometheus_data:/prometheus
61+
ports:
62+
- "9090:9090"
63+
depends_on:
64+
- cadvisor
65+
66+
volumes:
67+
prometheus_data:

docker/docker-compose.yml

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ services:
1717
- "${PORT_BACKEND}:7000"
1818
env_file:
1919
- ../src/backend/.env
20+
depends_on:
21+
- prometheus
2022
vue:
2123
container_name: df_frontend
2224
image: df_frontend
@@ -27,3 +29,39 @@ services:
2729
target: base
2830
ports:
2931
- "${PORT_FRONTEND}:8000"
32+
33+
cadvisor:
34+
image: gcr.io/cadvisor/cadvisor:v0.47.0
35+
container_name: df_cadvisor
36+
restart: unless-stopped
37+
mem_limit: 128m
38+
privileged: true
39+
volumes:
40+
- /:/rootfs:ro
41+
- /var/run:/var/run:ro
42+
- /sys:/sys:ro
43+
- /var/lib/docker/:/var/lib/docker:ro
44+
ports:
45+
- "8081:8080"
46+
47+
prometheus:
48+
image: prom/prometheus:v2.48.0
49+
container_name: df_prometheus
50+
restart: unless-stopped
51+
mem_limit: 384m
52+
command:
53+
- '--config.file=/etc/prometheus/prometheus.yml'
54+
- '--storage.tsdb.path=/prometheus'
55+
- '--storage.tsdb.retention.time=24h'
56+
- '--storage.tsdb.retention.size=256MB'
57+
- '--web.enable-lifecycle'
58+
volumes:
59+
- ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
60+
- prometheus_data:/prometheus
61+
ports:
62+
- "9090:9090"
63+
depends_on:
64+
- cadvisor
65+
66+
volumes:
67+
prometheus_data:

docker/prometheus.yml

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
global:
2+
scrape_interval: 15s
3+
evaluation_interval: 15s
4+
external_labels:
5+
monitor: 'domain-forge'
6+
7+
scrape_configs:
8+
- job_name: 'cadvisor'
9+
static_configs:
10+
- targets: ['cadvisor:8080']
11+
metric_relabel_configs:
12+
- source_labels: [__name__]
13+
regex: 'container_(cpu_usage_seconds_total|memory_usage_bytes|memory_max_usage_bytes|network_.*_bytes_total|fs_usage_bytes|fs_limit_bytes)'
14+
action: keep
15+
- job_name: 'prometheus'
16+
static_configs:
17+
- targets: ['localhost:9090']
18+
metric_relabel_configs:
19+
- source_labels: [__name__]
20+
regex: 'prometheus_(build_info|target_interval_length_seconds)'
21+
action: keep

src/backend/.env.sample

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,12 @@ MONGO_APP_ID=...
77
SENTRY_DSN=...
88
FRONTEND=...
99
ADMIN_LIST=admin1|admin2
10-
MEMORY_LIMIT=500m
10+
MEMORY_LIMIT=500m
11+
12+
# Health Monitor Configuration
13+
PROMETHEUS_URL=http://prometheus:9090
14+
HEALTH_CHECK_INTERVAL=30000
15+
HEALTH_DEBUG=false
16+
MAX_CPU_THRESHOLD=90
17+
MAX_MEMORY_THRESHOLD=85
18+
MAX_RESTART_COUNT=5

src/backend/deno.json

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
{
2+
"tasks": {
3+
"start": "deno run --allow-net --allow-read --allow-env server.ts",
4+
"test": "deno test --allow-env --allow-net ./tests/"
5+
},
6+
"imports": {
7+
"@std/assert": "jsr:@std/assert@1"
8+
}
9+
}

src/backend/health-api.ts

Lines changed: 202 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,202 @@
1+
import { Context } from "./dependencies.ts";
2+
import {
3+
getContainerHistory,
4+
getHealthSummary,
5+
isUnhealthy,
6+
type TimeStep,
7+
type TimeRange,
8+
} from "./utils/container-health.ts";
9+
import { restartContainer, getRestartCount } from "./utils/auto-restart.ts";
10+
import { getMonitorStatus, triggerHealthCheck } from "./health-monitor.ts";
11+
import { checkJWT } from "./utils/jwt.ts";
12+
13+
/**
 * Query window associated with each supported sampling step.
 *
 * Each key is a `TimeStep`; its entry repeats that step and pairs it with the
 * look-back duration to request at that resolution (finer steps use shorter
 * windows).
 */
const TIME_RANGE_PRESETS: Record<TimeStep, TimeRange> = {
  "1s": { step: "1s", duration: "5m" },
  "15s": { step: "15s", duration: "15m" },
  "1m": { step: "1m", duration: "1h" },
  "5m": { step: "5m", duration: "6h" },
  "1h": { step: "1h", duration: "24h" },
  "1d": { step: "1d", duration: "7d" },
};
21+
22+
23+
/**
 * Handler that reports the current health of every container in the summary.
 *
 * Credentials are read from the `user`, `token` and `provider` query
 * parameters. The request is rejected with 401 when any of them is missing or
 * empty, or when `checkJWT(provider, token)` does not resolve to `user`.
 *
 * On success the response body contains the total/healthy/unhealthy counts
 * and one entry per container with CPU and memory percentages rounded to two
 * decimals, memory usage in whole MiB, the restart count reported by
 * `getRestartCount`, and an ISO-8601 `lastUpdated` timestamp.
 */
export async function getContainerHealth(ctx: Context): Promise<void> {
  const query = ctx.request.url.searchParams;
  const author = query.get("user");
  const token = query.get("token");
  const provider = query.get("provider");

  // Refuse absent credentials before consulting checkJWT: a missing `user`
  // must never be able to compare equal to a "no identity" result.
  if (!author || !token || !provider) {
    return ctx.throw(401);
  }
  if (author !== await checkJWT(provider, token)) {
    ctx.throw(401);
  }

  const summary = await getHealthSummary();

  ctx.response.headers.set("Access-Control-Allow-Origin", "*");
  ctx.response.body = {
    total: summary.total,
    healthy: summary.healthy,
    unhealthy: summary.unhealthy,
    containers: summary.containers.map((c) => ({
      name: c.name,
      subdomain: c.subdomain,
      status: c.status,
      // Round to two decimal places.
      cpuPercent: Math.round(c.cpuPercent * 100) / 100,
      memoryPercent: Math.round(c.memoryPercent * 100) / 100,
      // Bytes -> whole MiB.
      memoryUsageMB: Math.round(c.memoryUsage / (1024 * 1024)),
      restartCount: getRestartCount(c.name),
      isHealthy: !isUnhealthy(c),
      lastUpdated: c.lastUpdated.toISOString(),
    })),
  };
}
52+
53+
54+
/**
 * Handler that returns the CPU and memory history of one container.
 *
 * The container is identified by the `subdomain` route parameter. The optional
 * `step` query parameter selects one of `TIME_RANGE_PRESETS`; an absent or
 * unrecognised value falls back to the `'1m'` preset.
 *
 * Credentials are read from the `user`, `token` and `provider` query
 * parameters. The request is rejected with 401 when any of them is missing or
 * empty, or when `checkJWT(provider, token)` does not resolve to `user`.
 *
 * NOTE(review): nothing here checks that `user` owns `subdomain` — confirm
 * that is enforced elsewhere or is intentionally open to any signed-in user.
 */
export async function getContainerMetrics(ctx: Context): Promise<void> {
  const subdomain = ctx.params.subdomain;
  const query = ctx.request.url.searchParams;
  const stepParam = query.get("step") ?? '1m';
  const author = query.get("user");
  const token = query.get("token");
  const provider = query.get("provider");

  // Refuse absent credentials before consulting checkJWT: a missing `user`
  // must never be able to compare equal to a "no identity" result.
  if (!author || !token || !provider) {
    return ctx.throw(401);
  }
  if (author !== await checkJWT(provider, token)) {
    ctx.throw(401);
  }

  // Only accept keys the preset table itself defines. A plain property lookup
  // would also resolve inherited names such as "toString" or "constructor"
  // and hand a function to getContainerHistory as the range.
  const step: TimeStep =
    Object.prototype.hasOwnProperty.call(TIME_RANGE_PRESETS, stepParam)
      ? (stepParam as TimeStep)
      : '1m';
  const range = TIME_RANGE_PRESETS[step];

  const history = await getContainerHistory(subdomain, range);

  ctx.response.headers.set("Access-Control-Allow-Origin", "*");
  ctx.response.body = {
    subdomain,
    step: range.step,
    duration: range.duration,
    dataPoints: history.cpu.length,
    // assumes getContainerHistory yields millisecond timestamps — TODO confirm
    // (Prometheus itself reports seconds).
    cpu: history.cpu.map(([ts, val]) => ({
      timestamp: new Date(ts).toISOString(),
      value: Math.round(val * 100) / 100,
    })),
    memory: history.memory.map(([ts, val]) => ({
      timestamp: new Date(ts).toISOString(),
      // Bytes -> whole MiB.
      valueMB: Math.round(val / (1024 * 1024)),
    })),
  };
}
86+
87+
88+
/**
 * Handler that returns an aggregated dashboard view of container health.
 *
 * Credentials are read from the `user`, `token` and `provider` query
 * parameters. The request is rejected with 401 when any of them is missing or
 * empty, or when `checkJWT(provider, token)` does not resolve to `user`.
 *
 * The response body has three parts:
 * - `overview`: container counts and the healthy share as a whole percentage
 *   (reported as 100 when there are no containers);
 * - `monitor`: the background monitor's running flag, interval and thresholds
 *   as reported by `getMonitorStatus`;
 * - `unhealthyContainers`: every container for which `isUnhealthy` is true,
 *   with a human-readable reason and the number of restart attempts recorded
 *   by the monitor (0 when none are recorded).
 */
export async function getHealthDashboard(ctx: Context): Promise<void> {
  const query = ctx.request.url.searchParams;
  const author = query.get("user");
  const token = query.get("token");
  const provider = query.get("provider");

  // Refuse absent credentials before consulting checkJWT: a missing `user`
  // must never be able to compare equal to a "no identity" result.
  if (!author || !token || !provider) {
    return ctx.throw(401);
  }
  if (author !== await checkJWT(provider, token)) {
    ctx.throw(401);
  }

  const summary = await getHealthSummary();
  const monitorStatus = getMonitorStatus();

  // With zero containers there is nothing unhealthy, so report 100%.
  const healthPercent = summary.total > 0
    ? Math.round((summary.healthy / summary.total) * 100)
    : 100;

  ctx.response.headers.set("Access-Control-Allow-Origin", "*");
  ctx.response.body = {
    overview: {
      total: summary.total,
      healthy: summary.healthy,
      unhealthy: summary.unhealthy,
      healthPercent,
    },
    monitor: {
      running: monitorStatus.running,
      checkIntervalMs: monitorStatus.interval,
      thresholds: monitorStatus.thresholds,
    },
    unhealthyContainers: summary.containers
      .filter((c) => isUnhealthy(c))
      .map((c) => ({
        name: c.name,
        subdomain: c.subdomain,
        reason: getUnhealthyReason(c),
        restartAttempts: monitorStatus.restartAttempts[c.name]?.count ?? 0,
      })),
  };
}
125+
126+
127+
/**
 * Handler that restarts the container named by the `subdomain` route
 * parameter.
 *
 * The request body (an object, or a string containing JSON) must carry string
 * `author`, `token` and `provider` fields. The request is rejected with 401
 * when any of them is missing, empty or not a string, or when
 * `checkJWT(provider, token)` does not resolve to `author`.
 *
 * Responds with `{ status: "success", ... }` once `restartContainer` resolves,
 * or with HTTP 500 and `{ status: "error", ... }` if it rejects.
 *
 * NOTE(review): nothing here checks that `author` owns `subdomain` — confirm
 * that `restartContainer` or the router enforces ownership.
 */
export async function restartContainerHandler(ctx: Context): Promise<void> {
  const subdomain = ctx.params.subdomain;

  const rawBody: unknown = await ctx.request.body().value;
  let document: unknown = rawBody;
  if (typeof rawBody === 'string') {
    try {
      document = JSON.parse(rawBody);
    } catch {
      // Not JSON: keep the raw string, which fails the credential check below.
    }
  }

  const fields: Record<string, unknown> =
    typeof document === 'object' && document !== null
      ? (document as Record<string, unknown>)
      : {};
  const { author, token, provider } = fields;

  // Refuse absent or non-string credentials before consulting checkJWT: an
  // undefined `author` must never be able to compare equal to a "no identity"
  // result.
  if (
    typeof author !== 'string' || author === '' ||
    typeof token !== 'string' || token === '' ||
    typeof provider !== 'string' || provider === ''
  ) {
    return ctx.throw(401);
  }
  if (author !== await checkJWT(provider, token)) {
    ctx.throw(401);
  }

  // Set the CORS header for both outcomes so a browser client can read the
  // error body as well as the success body.
  ctx.response.headers.set("Access-Control-Allow-Origin", "*");

  try {
    await restartContainer(subdomain);

    ctx.response.body = {
      status: "success",
      message: `Container ${subdomain} restart initiated`,
    };
  } catch (error) {
    ctx.response.status = 500;
    ctx.response.body = {
      status: "error",
      message: `Failed to restart ${subdomain}: ${error}`,
    };
  }
}
162+
163+
164+
/**
 * Handler that lets an administrator trigger an immediate health check.
 *
 * The request body (an object, or a string containing JSON) must carry string
 * `author`, `token` and `provider` fields. The request is rejected with 401
 * when any of them is missing, empty or not a string, when
 * `checkJWT(provider, token)` does not resolve to `author`, or when `author`
 * is not listed in the `|`-separated `ADMIN_LIST` environment variable.
 *
 * On success it awaits `triggerHealthCheck()` and responds with
 * `{ status: "success", message: "Health check triggered" }`.
 */
export async function triggerHealthCheckHandler(ctx: Context): Promise<void> {
  const rawBody: unknown = await ctx.request.body().value;
  let document: unknown = rawBody;
  if (typeof rawBody === 'string') {
    try {
      document = JSON.parse(rawBody);
    } catch {
      // Not JSON: keep the raw string, which fails the credential check below.
    }
  }

  const fields: Record<string, unknown> =
    typeof document === 'object' && document !== null
      ? (document as Record<string, unknown>)
      : {};
  const { author, token, provider } = fields;

  // Drop empty entries: an empty ADMIN_LIST would otherwise split into [""]
  // and make the empty string an administrator.
  const ADMIN_LIST = (Deno.env.get("ADMIN_LIST")?.split("|") ?? [])
    .filter((name) => name !== '');

  // Refuse absent or non-string credentials before consulting checkJWT: an
  // undefined `author` must never be able to compare equal to a "no identity"
  // result.
  if (
    typeof author !== 'string' || author === '' ||
    typeof token !== 'string' || token === '' ||
    typeof provider !== 'string' || provider === ''
  ) {
    return ctx.throw(401);
  }
  if (author !== await checkJWT(provider, token) || !ADMIN_LIST.includes(author)) {
    ctx.throw(401);
  }

  await triggerHealthCheck();

  ctx.response.headers.set("Access-Control-Allow-Origin", "*");
  ctx.response.body = {
    status: "success",
    message: "Health check triggered",
  };
}
190+
191+
192+
/**
 * Builds a human-readable explanation of why a container counts as unhealthy.
 *
 * Each condition that holds contributes one phrase; phrases are joined with
 * ", " in a fixed order (CPU, memory, restarts, exited, failed health check).
 * Values exactly equal to a limit are not flagged.
 *
 * @param c - Snapshot of the container's CPU/memory percentages, restart
 *   count and status string.
 * @param thresholds - Optional limits to compare against. Any limit left out
 *   keeps its default: 90 (% CPU), 85 (% memory), 5 (restarts). Callers that
 *   know the configured limits can pass them so the text matches the monitor.
 * @returns The joined reasons, or `'Unknown'` when no condition matches.
 */
function getUnhealthyReason(
  c: { cpuPercent: number; memoryPercent: number; restartCount: number; status: string },
  thresholds: { maxCpu?: number; maxMemory?: number; maxRestarts?: number } = {},
): string {
  const { maxCpu = 90, maxMemory = 85, maxRestarts = 5 } = thresholds;
  const reasons: string[] = [];

  if (c.cpuPercent > maxCpu) reasons.push(`High CPU (${c.cpuPercent.toFixed(1)}%)`);
  if (c.memoryPercent > maxMemory) reasons.push(`High Memory (${c.memoryPercent.toFixed(1)}%)`);
  if (c.restartCount > maxRestarts) reasons.push(`Many restarts (${c.restartCount})`);
  if (c.status === 'exited') reasons.push('Container exited');
  if (c.status === 'unhealthy') reasons.push('Health check failed');

  return reasons.join(', ') || 'Unknown';
}

0 commit comments

Comments
 (0)