File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -125,13 +125,11 @@ def update(self):
125125
126126 for idx , ws in enumerate (workers ):
127127 try :
128- info = ws .extra ["gpu" ]
128+ mem_used = ws .metrics ["gpu_memory_used" ]
129+ mem_total = ws .metrics ["gpu-memory-total" ]
130+ u = ws .metrics ["gpu_utilization" ]
129131 except KeyError :
130132 continue
131- metrics = ws .metrics ["gpu" ]
132- u = metrics ["utilization" ]
133- mem_used = metrics ["memory-used" ]
134- mem_total = info ["memory-total" ]
135133 memory_max = max (memory_max , mem_total )
136134 memory_total += mem_total
137135 utilization .append (int (u ))
Original file line number Diff line number Diff line change @@ -129,6 +129,7 @@ def __init__(
129129 gpu_extra = nvml .one_time ()
130130 self .gpu_name = gpu_extra ["name" ]
131131 self .gpu_memory_total = gpu_extra ["memory-total" ]
132+ self .quantities ["gpu-memory-total" ] = deque (maxlen = 1 )
132133 self .quantities ["gpu_utilization" ] = deque (maxlen = maxlen )
133134 self .quantities ["gpu_memory_used" ] = deque (maxlen = maxlen )
134135 else :
@@ -207,6 +208,7 @@ def update(self) -> dict[str, Any]:
207208
208209 if self .gpu_name :
209210 gpu_metrics = nvml .real_time ()
211+ result ["gpu-memory-total" ] = self .gpu_memory_total
210212 result ["gpu_utilization" ] = gpu_metrics ["utilization" ]
211213 result ["gpu_memory_used" ] = gpu_metrics ["memory-used" ]
212214
You can’t perform that action at this time.
0 commit comments