refactor: simplify LocalBackend pipeline trainer integration

vivekkalyan · vivekkalyan · commit 47579de3e8db · 2026-03-18T14:52:01.000-07:00
diff --git a/src/art/local/backend.py b/src/art/local/backend.py
@@ -168,25 +168,6 @@ def __exit__(
         exc: BaseException | None,
         tb: TracebackType | None,
     ) -> None:
-        try:
-            asyncio.get_running_loop()
-        except RuntimeError:
-            running_loop = False
-        else:
-            running_loop = True
-
-        if running_loop or any(
-            getattr(service, "aclose", None) is not None
-            for service in self._services.values()
-        ):
-            warnings.warn(
-                "LocalBackend used as a sync context manager. Cleanup uses the "
-                "best-effort sync shutdown path and cannot await service "
-                "teardown safely here; use `async with LocalBackend(...)` or "
-                "`await backend.close()` instead.",
-                RuntimeWarning,
-                stacklevel=2,
-            )
         self._close()
 
     async def __aexit__(
@@ -201,20 +182,18 @@ async def close(self) -> None:
         """
         If running vLLM in a separate process, this will kill that process and close the communication threads.
         """
-        for _, service in self._services.items():
-            # Keep this logic aligned with _close(), but avoid double-closing
-            # services that expose an awaited aclose() path.
+        for service in self._services.values():
             aclose = getattr(service, "aclose", None)
-            if aclose is not None:
-                await aclose()
-            else:
+            if aclose is None:
                 close = getattr(service, "close", None)
                 if close is not None:
                     close()
+            else:
+                await aclose()
             close_proxy(service)
 
     def _close(self) -> None:
-        for _, service in self._services.items():
+        for service in self._services.values():
             close = getattr(service, "close", None)
             if close is not None:
                 close()
@@ -259,35 +238,27 @@ def _model_inference_name(self, model: Model, step: int | None = None) -> str:
                   If None, returns name for latest checkpoint (step 0 initially).
         """
 
-        def _served_step() -> int | None:
-            if not isinstance(model, TrainableModel):
-                return None
-            if model.name not in self._services:
-                return None
+        requested_step = step
+
+        if step is None and isinstance(model, TrainableModel):
             from ..dev.validate import is_dedicated_mode
 
-            if not is_dedicated_mode(
+            service = self._services.get(model.name)
+            if service is not None and is_dedicated_mode(
                 model._internal_config or dev.InternalModelConfig()
             ):
-                return None
-            loaded_step = getattr(self._services[model.name], "_latest_step", None)
-            return loaded_step if isinstance(loaded_step, int) else None
-
-        # For LocalBackend, vLLM always serves LoRA adapters with @step suffix
-        # Default to step 0 when not specified (the initial checkpoint created at registration)
-        if step is not None:
-            actual_step = step
-        else:
-            # In dedicated mode the service tracks which adapter vLLM has
-            # actually loaded. Reading the filesystem would race: the checkpoint
-            # directory appears before the HTTP reload completes.
-            actual_step = _served_step()
-            if actual_step is None:
-                actual_step = self.__get_step(model)
-        name = f"{model.name}@{actual_step}"
+                loaded_step = getattr(service, "_latest_step", None)
+                if isinstance(loaded_step, int):
+                    step = loaded_step
+
+        if step is None:
+            # Fall back to the on-disk step. It can run ahead of what dedicated-mode
+            # vLLM serves: the checkpoint is written before the adapter reload ends.
+            step = self.__get_step(model)
+        name = f"{model.name}@{step}"
         logger.debug(
-            f"[BACKEND] _model_inference_name: step_arg={step} "
-            f"actual_step={actual_step} -> {name}"
+            f"[BACKEND] _model_inference_name: step_arg={requested_step} "
+            f"actual_step={step} -> {name}"
         )
         return name
 
@@ -552,12 +523,14 @@ async def train(  # type: ignore[override]
         *,
         # Core training parameters
         learning_rate: float = 5e-6,
+        loss_fn: Literal["cispo", "ppo", "importance_sampling", "dro"] = "cispo",
+        loss_fn_config: dict | None = None,
+        normalize_advantages: bool = True,
+        adam_params: object | None = None,
         # KL-penalized advantage adjustment
         kl_penalty_coef: float = 0.0,
         kl_penalty_reference_step: int | None = None,
         kl_ref_adapter_path: str | None = None,
-        # RL algorithm settings
-        ppo: bool = False,
         epsilon: float | None = None,
         epsilon_high: float | None = None,
         # Advantage computation
@@ -594,6 +567,14 @@ async def train(  # type: ignore[override]
             model: The trainable model to train.
             trajectory_groups: Batches of trajectories to train on.
             learning_rate: Learning rate for training. Defaults to 5e-6.
+            loss_fn: RL loss function. LocalBackend currently supports
+                "cispo" and "ppo".
+            loss_fn_config: Additional loss-function config. Not supported by
+                LocalBackend.
+            normalize_advantages: Whether to normalize advantages. LocalBackend
+                currently requires True.
+            adam_params: Custom optimizer params. Not supported by
+                LocalBackend.
             kl_penalty_coef: Coefficient for KL-penalized advantage adjustment.
                 Tokens diverging more from the reference get reduced advantages.
                 Defaults to 0.0 (disabled).
@@ -603,7 +584,6 @@ async def train(  # type: ignore[override]
             kl_ref_adapter_path: Direct filesystem path to a LoRA adapter
                 checkpoint to use as the KL reference. Alternative to
                 kl_penalty_reference_step.
-            ppo: Whether to use PPO clipping. Defaults to False.
-            epsilon: Clip epsilon for importance sampling. Defaults based on ppo.
+            epsilon: Clip epsilon for importance sampling. Defaults based on loss_fn.
             epsilon_high: Asymmetric upper clip bound. Defaults to epsilon.
             advantage_balance: Balance between negative and positive advantages
@@ -647,6 +627,14 @@ async def train(  # type: ignore[override]
             # await model.log(metrics=result.metrics, step=result.step)
         """
         groups_list = list(trajectory_groups)
+        if loss_fn not in {"cispo", "ppo"}:
+            raise ValueError("LocalBackend only supports loss_fn='cispo' or 'ppo'.")
+        if loss_fn_config is not None:
+            raise ValueError("LocalBackend requires loss_fn_config=None.")
+        if not normalize_advantages:
+            raise ValueError("LocalBackend requires normalize_advantages=True.")
+        if adam_params is not None:
+            raise ValueError("LocalBackend requires adam_params=None.")
 
         # Build config objects from explicit kwargs
         config = TrainConfig(
@@ -659,7 +647,7 @@ async def train(  # type: ignore[override]
             "kl_penalty_coef": kl_penalty_coef,
             "mask_prob_ratio": mask_prob_ratio,
             "plot_tensors": plot_tensors,
-            "ppo": ppo,
+            "ppo": loss_fn == "ppo",
             "precalculate_logprobs": precalculate_logprobs,
             "scale_learning_rate_by_reward_std_dev": scale_learning_rate_by_reward_std_dev,
             "scale_rewards": scale_rewards,
diff --git a/src/art/pipeline_trainer/trainer.py b/src/art/pipeline_trainer/trainer.py
@@ -278,35 +278,22 @@ async def _notify_policy() -> None:
             except asyncio.QueueFull:
                 loop.create_task(self._output_queue.put(None))
 
-    def _is_local_backend(self) -> bool:
-        from art.local.backend import LocalBackend
-
-        return isinstance(self.backend, LocalBackend)
-
-    def _local_backend_is_dedicated(self) -> bool:
-        if not isinstance(self.model, art.TrainableModel):
-            return False
+    def _validate_backend_support(self) -> None:
         from art.dev.validate import is_dedicated_mode
+        from art.local.backend import LocalBackend
 
-        return is_dedicated_mode(
-            self.model._internal_config or art.dev.InternalModelConfig()
-        )
-
-    def _validate_backend_support(self) -> None:
-        if not self._is_local_backend():
-            return
-        if self._local_backend_is_dedicated():
-            self._validate_local_backend_train_config()
+        if not isinstance(self.backend, LocalBackend):
             return
-        raise ValueError(
-            "PipelineTrainer only supports LocalBackend in dedicated mode. "
-            "Shared LocalBackend pauses inference during training and is not "
-            "a supported async PipelineTrainer path. Set both "
-            "trainer_gpu_ids and inference_gpu_ids on the TrainableModel "
-            "_internal_config to use LocalBackend with PipelineTrainer."
-        )
 
-    def _validate_local_backend_train_config(self) -> None:
+        model_config = self.model._internal_config or art.dev.InternalModelConfig()
+        if not is_dedicated_mode(model_config):
+            raise ValueError(
+                "PipelineTrainer only supports LocalBackend in dedicated mode. "
+                "Shared LocalBackend pauses inference during training and is not "
+                "a supported async PipelineTrainer path. Set both "
+                "trainer_gpu_ids and inference_gpu_ids on the TrainableModel "
+                "_internal_config to use LocalBackend with PipelineTrainer."
+            )
         if self.loss_fn not in {"cispo", "ppo"}:
             raise ValueError(
                 "PipelineTrainer + LocalBackend(dedicated) only supports "
@@ -327,23 +314,6 @@ def _validate_local_backend_train_config(self) -> None:
                 "PipelineTrainer + LocalBackend(dedicated) requires adam_params=None."
             )
 
-    def _backend_train_kwargs(self, *, save_checkpoint: bool) -> dict[str, Any]:
-        if not self._is_local_backend():
-            return {
-                "learning_rate": self.learning_rate,
-                "loss_fn": self.loss_fn,
-                "loss_fn_config": self.loss_fn_config,
-                "normalize_advantages": self.normalize_advantages,
-                "save_checkpoint": save_checkpoint,
-                "adam_params": self.adam_params,
-            }
-
-        return {
-            "learning_rate": self.learning_rate,
-            "ppo": self.loss_fn == "ppo",
-            "save_checkpoint": save_checkpoint,
-        }
-
     async def _skip_scenarios(
         self, scenarios: AsyncIterator[ScenarioT], count: int
     ) -> int:
@@ -479,14 +449,18 @@ async def _training_stage(self) -> None:
 
             self._status.note_training_start(len(batch))
             train_call_start = time.monotonic()
-            train_kwargs = self._backend_train_kwargs(save_checkpoint=should_checkpoint)
             if os.getenv("ART_TRAIN_STEP_LOG"):
                 print(f"[train] step {expected_step} starting (batch={len(batch)})")
             try:
                 result = await self.backend.train(
                     self.model,
                     batch,
-                    **train_kwargs,
+                    learning_rate=self.learning_rate,
+                    loss_fn=self.loss_fn,
+                    loss_fn_config=self.loss_fn_config,
+                    normalize_advantages=self.normalize_advantages,
+                    save_checkpoint=should_checkpoint,
+                    adam_params=self.adam_params,
                 )
             except Exception:
                 self._status.note_training_end()
diff --git a/src/art/unsloth/service.py b/src/art/unsloth/service.py
@@ -60,8 +60,8 @@ class _StopTrainInputs:
 
 
 _STOP_TRAIN_INPUT = _StopTrainInputs()
-_TRAIN_TASK_GRACEFUL_SHUTDOWN_TIMEOUT_S = 5.0
-_TRAIN_TASK_CANCEL_TIMEOUT_S = 1.0
+_TRAIN_TASK_SHUTDOWN_TIMEOUT_S = 5.0
+_TrainLoopInput = TrainInputs | _StopTrainInputs
 
 
 def precalculate_new_logprobs(
@@ -100,7 +100,7 @@ async def process_train_batch(
     packed_tensors: PackedTensors,
     config: types.TrainConfig,
     _config: dev.TrainConfig,
-    inputs_queue: asyncio.Queue[TrainInputs | _StopTrainInputs],
+    inputs_queue: asyncio.Queue[_TrainLoopInput],
     results_queue: asyncio.Queue[dict[str, float]],
     train_task: asyncio.Task[None],
     trainer: "GRPOTrainer",
@@ -224,7 +224,7 @@ class UnslothState:
     tokenizer: PreTrainedTokenizerBase
     peft_model: peft.peft_model.PeftModelForCausalLM
     trainer: GRPOTrainer
-    inputs_queue: asyncio.Queue[TrainInputs | _StopTrainInputs]
+    inputs_queue: asyncio.Queue[_TrainLoopInput]
     results_queue: asyncio.Queue[dict[str, float]]
     _is_offloaded: bool = False
     _pinned_buffers: dict[str, torch.Tensor] | None = None
@@ -336,44 +336,22 @@ def _next_lora_id(self) -> int:
         self._lora_id_counter += 1
         return self._lora_id_counter
 
-    def _request_train_task_stop(self) -> asyncio.Task[None] | None:
+    async def aclose(self) -> None:
         train_task = self._train_task
-        if train_task is None:
-            return None
-        if train_task.done():
-            return train_task
-
-        # `_state` is a cached_property. Read from __dict__ directly so shutdown
-        # does not instantiate the full trainer state solely to stop a task.
-        state = self.__dict__.get("_state")
-        if isinstance(state, UnslothState):
-            state.inputs_queue.put_nowait(_STOP_TRAIN_INPUT)
-        return train_task
-
-    async def _shutdown_train_task(self) -> None:
-        train_task = self._request_train_task_stop()
-        if train_task is None:
+        self._train_task = None
+        if train_task is None or train_task.done():
+            self.close()
             return
 
+        # `_state` is a cached_property. Read from __dict__ directly so
+        # closing does not instantiate trainer state only to stop a task.
+        state = self.__dict__.get("_state")
+        assert isinstance(state, UnslothState)
+        state.inputs_queue.put_nowait(_STOP_TRAIN_INPUT)
         try:
-            # Give the trainer loop time to consume the stop sentinel and exit
-            # normally before falling back to cancellation.
-            await asyncio.wait_for(
-                train_task, timeout=_TRAIN_TASK_GRACEFUL_SHUTDOWN_TIMEOUT_S
-            )
+            await asyncio.wait_for(train_task, timeout=_TRAIN_TASK_SHUTDOWN_TIMEOUT_S)
         except asyncio.TimeoutError:
             train_task.cancel()
-            try:
-                await asyncio.wait_for(train_task, timeout=_TRAIN_TASK_CANCEL_TIMEOUT_S)
-            except (asyncio.CancelledError, asyncio.TimeoutError):
-                pass
-        except asyncio.CancelledError:
-            pass
-        finally:
-            self._train_task = None
-
-    async def aclose(self) -> None:
-        await self._shutdown_train_task()
         self.close()
 
     # =========================================================================
@@ -500,7 +478,6 @@ async def _reload_adapter(self, checkpoint_path: str, step: int) -> None:
 
     def close(self) -> None:
         """Terminate vLLM subprocess if running."""
-        self._request_train_task_stop()
         if self._vllm_process is None:
             return
         self._vllm_process.terminate()
@@ -646,7 +623,7 @@ async def _train_dedicated(
 
         await self._state.results_queue.join()
 
-        if not hasattr(self, "_train_task") or self._train_task is None:
+        if self._train_task is None:
             self._train_task = asyncio.create_task(
                 train(
                     trainer=self._state.trainer,
@@ -736,7 +713,7 @@ async def _train_shared(
         await self._state.results_queue.join()
 
         # If we haven't already, start the training task
-        if not hasattr(self, "_train_task") or self._train_task is None:
+        if self._train_task is None:
             self._train_task = asyncio.create_task(
                 train(
                     trainer=self._state.trainer,
@@ -1032,12 +1009,12 @@ def _state(self) -> UnslothState:
             trainer.create_optimizer()
 
         # Initialize queues
-        inputs_queue: asyncio.Queue[TrainInputs | _StopTrainInputs] = asyncio.Queue()
+        inputs_queue: asyncio.Queue[_TrainLoopInput] = asyncio.Queue()
         results_queue: asyncio.Queue[dict[str, float]] = asyncio.Queue()
 
         # Patch trainer _prepare_inputs() to pull from queue
         def _async_prepare_inputs(*_: Any, **__: Any) -> dict[str, torch.Tensor]:
-            async def get_inputs() -> TrainInputs | _StopTrainInputs:
+            async def get_inputs() -> _TrainLoopInput:
                 return await inputs_queue.get()
 
             # Force otherwise synchronous _prepare_inputs() to yield
diff --git a/src/art/unsloth/train.py b/src/art/unsloth/train.py
@@ -78,10 +78,9 @@ async def train(
     if not is_train_dict:
         trainer._metrics = {"train": defaultdict(list)}
     try:
-        try:
-            trainer.train()
-        except StopTrainingLoop:
-            return
+        trainer.train()
+    except StopTrainingLoop:
+        return
     finally:
         trainer.compute_loss = _compute_loss
         trainer.log = _log  # ty:ignore[invalid-assignment]
diff --git a/tests/integration/test_pipeline_localbackend_dedicated.py b/tests/integration/test_pipeline_localbackend_dedicated.py
diff --git a/tests/unit/test_pipeline_trainer_local_backend.py b/tests/unit/test_pipeline_trainer_local_backend.py