Skip to content

Commit e7f756b

Browse files
authored
Fix GPU Dashboard (#8572)
1 parent e434793 commit e7f756b

2 files changed

Lines changed: 5 additions & 5 deletions

File tree

distributed/dashboard/components/nvml.py

Lines changed: 3 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -125,13 +125,11 @@ def update(self):
125125

126126
for idx, ws in enumerate(workers):
127127
try:
128-
info = ws.extra["gpu"]
128+
mem_used = ws.metrics["gpu_memory_used"]
129+
mem_total = ws.metrics["gpu-memory-total"]
130+
u = ws.metrics["gpu_utilization"]
129131
except KeyError:
130132
continue
131-
metrics = ws.metrics["gpu"]
132-
u = metrics["utilization"]
133-
mem_used = metrics["memory-used"]
134-
mem_total = info["memory-total"]
135133
memory_max = max(memory_max, mem_total)
136134
memory_total += mem_total
137135
utilization.append(int(u))

distributed/system_monitor.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -129,6 +129,7 @@ def __init__(
129129
gpu_extra = nvml.one_time()
130130
self.gpu_name = gpu_extra["name"]
131131
self.gpu_memory_total = gpu_extra["memory-total"]
132+
self.quantities["gpu-memory-total"] = deque(maxlen=1)
132133
self.quantities["gpu_utilization"] = deque(maxlen=maxlen)
133134
self.quantities["gpu_memory_used"] = deque(maxlen=maxlen)
134135
else:
@@ -207,6 +208,7 @@ def update(self) -> dict[str, Any]:
207208

208209
if self.gpu_name:
209210
gpu_metrics = nvml.real_time()
211+
result["gpu-memory-total"] = self.gpu_memory_total
210212
result["gpu_utilization"] = gpu_metrics["utilization"]
211213
result["gpu_memory_used"] = gpu_metrics["memory-used"]
212214

0 commit comments

Comments
 (0)