fix: update step handling in LocalBackend and improve error handling

bradhilton · bradhilton · commit 59778f088f24 · 2026-01-22T20:35:11.000-07:00
- Changed the default step in _model_inference_name() from a hard-coded 0 to the value returned by self.__get_step(model) when no step is passed.
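- Wrapped the UnslothService import and LoRA registration in _train_model() in a try/except ModuleNotFoundError so the step advance no longer crashes when Unsloth is not installed; the registration is skipped in that case.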
- Reformatted the num_gradient_steps == estimated_gradient_steps assertion in _train_model() (formatting only; condition and message are unchanged).
diff --git a/src/art/local/backend.py b/src/art/local/backend.py
@@ -128,9 +128,10 @@ def _model_inference_name(self, model: Model, step: int | None = None) -> str:
             step: If provided, returns name for specific checkpoint.
                   If None, returns name for latest checkpoint (step 0 initially).
         """
+
         # For LocalBackend, vLLM always serves LoRA adapters with @step suffix
         # Default to step 0 when not specified (the initial checkpoint created at registration)
-        actual_step = step if step is not None else 0
+        actual_step = step if step is not None else self.__get_step(model)
         return f"{model.name}@{actual_step}"
 
     async def _get_service(self, model: TrainableModel) -> ModelService:
@@ -573,12 +574,17 @@ async def _train_model(
                     f"Advanced step from {current_step} to {next_step} (no training occurred)"
                 )
 
-                # Register the renamed checkpoint as a new LoRA adapter
-                # so it's available for inference at the new step
-                from ..unsloth.service import UnslothService
+                try:
+                    # Register the renamed checkpoint as a new LoRA adapter
+                    # so it's available for inference at the new step
+                    from ..unsloth.service import UnslothService
 
-                if isinstance(service, UnslothService):
-                    await service.register_lora_for_step(next_step, next_checkpoint_dir)
+                    if isinstance(service, UnslothService):
+                        await service.register_lora_for_step(
+                            next_step, next_checkpoint_dir
+                        )
+                except ModuleNotFoundError:
+                    pass  # Unsloth is not installed
 
             # Yield metrics showing no groups were trainable
             # (the frontend will handle logging)
@@ -601,9 +607,9 @@ async def _train_model(
             num_gradient_steps = int(
                 result.pop("num_gradient_steps", estimated_gradient_steps)
             )
-            assert num_gradient_steps == estimated_gradient_steps, (
-                f"num_gradient_steps {num_gradient_steps} != estimated_gradient_steps {estimated_gradient_steps}"
-            )
+            assert (
+                num_gradient_steps == estimated_gradient_steps
+            ), f"num_gradient_steps {num_gradient_steps} != estimated_gradient_steps {estimated_gradient_steps}"
             results.append(result)
             yield {**result, "num_gradient_steps": num_gradient_steps}
             pbar.update(1)