Skip to content

Commit aed1ee7

Browse files
authored
Clean up unused adapters before saving checkpoint (#605)
Removes reference adapters (loaded for KL/logprob computation) from the PEFT model before saving, freeing GPU/CPU memory. Also adds a gc_and_empty_cuda_cache call after saving. Made-with: Cursor
1 parent 43a6ed0 commit aed1ee7

1 file changed

Lines changed: 29 additions & 0 deletions

File tree

src/art/unsloth/service.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,13 +151,42 @@ def save_checkpoint(
151151
verbose: bool = False,
152152
) -> str:
153153
"""Save a checkpoint and return the checkpoint directory path."""
154+
# _use_adapter() may load reference adapters for KL/logprob computation and
155+
# keep them attached to the PEFT model. Before saving, keep only active
156+
# adapter(s) and drop the rest to release GPU/CPU memory.
157+
try:
158+
peft_model = trainer.accelerator.unwrap_model( # type: ignore[attr-defined]
159+
trainer.model, keep_fp32_wrapper=False
160+
)
161+
active_adapters = peft_model.active_adapter
162+
if isinstance(active_adapters, str):
163+
keep_adapters = {active_adapters}
164+
else:
165+
keep_adapters = set(active_adapters)
166+
167+
before_adapters = list(peft_model.peft_config.keys())
168+
print(f"Adapters before cleanup: {before_adapters}")
169+
print(f"Keeping active adapter(s): {sorted(keep_adapters)}")
170+
171+
for adapter_name in before_adapters:
172+
if adapter_name not in keep_adapters:
173+
peft_model.delete_adapter(adapter_name)
174+
print(f"Deleted unused adapter: {adapter_name}")
175+
176+
after_adapters = list(peft_model.peft_config.keys())
177+
print(f"Adapters after cleanup: {after_adapters}")
178+
except Exception as e:
179+
print(f"Warning: failed to cleanup unused adapters: {e}")
180+
154181
if verbose:
155182
print("Saving new LoRA adapter...")
156183
next_step = get_step_from_dir(output_dir) + 1
157184
checkpoint_dir = get_step_checkpoint_dir(output_dir, next_step)
158185
os.makedirs(checkpoint_dir, exist_ok=True)
159186
trainer.save_model(checkpoint_dir)
160187
convert_checkpoint_if_needed(checkpoint_dir)
188+
189+
gc_and_empty_cuda_cache()
161190
return checkpoint_dir
162191

163192

0 commit comments

Comments (0)