Skip to content

Commit 0f2290b

Browse files
authored
Send log-event if worker is restarted because of memory pressure (#8617)
1 parent cd9951b commit 0f2290b

2 files changed

Lines changed: 15 additions & 2 deletions

File tree

distributed/tests/test_nanny.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -660,6 +660,12 @@ async def test_restart_memory(c, s, n):
660660
while not s.workers:
661661
await asyncio.sleep(0.1)
662662

663+
msgs = s.get_events("worker-restart-memory")
664+
assert len(msgs)
665+
msg = msgs[0][1]
666+
assert isinstance(msg, dict)
667+
assert {"worker", "pid", "rss"}.issubset(set(msg))
668+
663669

664670
class BlockClose(WorkerPlugin):
665671
def __init__(self, close_happened):

distributed/worker_memory.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -417,12 +417,19 @@ def memory_monitor(self, nanny: Nanny) -> None:
417417
return
418418

419419
if self._last_terminated_pid != process.pid:
420-
nanny_logger.warning(
420+
msg = (
421421
f"Worker {nanny.worker_address} (pid={process.pid}) exceeded "
422422
f"{self.memory_terminate_fraction * 100:.0f}% memory budget. "
423-
"Restarting...",
423+
f"Restarting..."
424424
)
425+
nanny_logger.warning(msg)
425426
self._last_terminated_pid = process.pid
427+
event = {
428+
"worker": nanny.worker_address,
429+
"pid": process.pid,
430+
"rss": memory,
431+
}
432+
nanny.log_event("worker-restart-memory", event)
426433
process.terminate()
427434
else:
428435
# We already sent SIGTERM to the worker, but the process is still alive

0 commit comments

Comments
 (0)