Skip to content

Commit 8636830

Browse files
corbt and Cursor Bot authored
feat: move trajectory logging from backend to frontend (#518)
* feat: move trajectory logging from backend to frontend (Model class)

  Implements RFC #511 - moves trajectory persistence and metrics reporting from the backend to the Model class, allowing backends to focus solely on training.

  Key changes:
  - Model.log() now handles parquet writing, metrics calculation, history.jsonl
  - Model gains base_path and report_metrics attributes for configuration
  - TrainableModel.train() calls log() first, then backend._train_model()
  - TrainableModel.delete_checkpoints() reads history.jsonl locally
  - Backend._log() removed; _delete_checkpoints() renamed to _delete_checkpoint_files()
  - Removed scale_learning_rate_by_reward_std_dev handling (users can implement if needed)

  Breaking changes:
  - Backend interface: _log() removed, _delete_checkpoints() renamed
  - ServerlessBackend: trajectories now saved locally instead of sent to API

  Closes #511

* fix: sort imports in vllm/server.py

---------

Co-authored-by: Cursor Bot <bot@cursor.com>
1 parent beb0315 commit 8636830

7 files changed

Lines changed: 755 additions & 343 deletions

File tree

src/art/backend.py

Lines changed: 4 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -55,16 +55,14 @@ async def _get_step(self, model: "TrainableModel") -> int:
5555
response.raise_for_status()
5656
return response.json()
5757

58-
async def _delete_checkpoints(
58+
async def _delete_checkpoint_files(
5959
self,
6060
model: "TrainableModel",
61-
benchmark: str,
62-
benchmark_smoothing: float,
61+
steps_to_keep: list[int],
6362
) -> None:
6463
response = await self._client.post(
65-
"/_delete_checkpoints",
66-
json=model.safe_model_dump(),
67-
params={"benchmark": benchmark, "benchmark_smoothing": benchmark_smoothing},
64+
"/_delete_checkpoint_files",
65+
json={"model": model.safe_model_dump(), "steps_to_keep": steps_to_keep},
6866
)
6967
response.raise_for_status()
7068

@@ -82,23 +80,6 @@ async def _prepare_backend_for_training(
8280
base_url, api_key = tuple(response.json())
8381
return base_url, api_key
8482

85-
async def _log(
86-
self,
87-
model: "Model",
88-
trajectory_groups: list[TrajectoryGroup],
89-
split: str = "val",
90-
) -> None:
91-
response = await self._client.post(
92-
"/_log",
93-
json={
94-
"model": model.safe_model_dump(),
95-
"trajectory_groups": [tg.model_dump() for tg in trajectory_groups],
96-
"split": split,
97-
},
98-
timeout=None,
99-
)
100-
response.raise_for_status()
101-
10283
async def _train_model(
10384
self,
10485
model: "TrainableModel",

src/art/cli.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -173,7 +173,13 @@ async def art_error_handler(request: Request, exc: ARTError):
173173
app.post("/close")(backend.close)
174174
app.post("/register")(backend.register)
175175
app.post("/_get_step")(backend._get_step)
176-
app.post("/_delete_checkpoints")(backend._delete_checkpoints)
176+
177+
@app.post("/_delete_checkpoint_files")
178+
async def _delete_checkpoint_files(
179+
model: TrainableModel = Body(...),
180+
steps_to_keep: list[int] = Body(...),
181+
):
182+
await backend._delete_checkpoint_files(model, steps_to_keep)
177183

178184
@app.post("/_prepare_backend_for_training")
179185
async def _prepare_backend_for_training(
@@ -182,13 +188,7 @@ async def _prepare_backend_for_training(
182188
):
183189
return await backend._prepare_backend_for_training(model, config)
184190

185-
@app.post("/_log")
186-
async def _log(
187-
model: Model,
188-
trajectory_groups: list[TrajectoryGroup],
189-
split: str = Body("val"),
190-
):
191-
await backend._log(model, trajectory_groups, split)
191+
# Note: /_log endpoint removed - logging now handled by frontend (Model.log())
192192

193193
@app.post("/_train_model")
194194
async def _train_model(

0 commit comments

Comments
 (0)