Skip to content

Commit 176632c

Browse files
authored
Serverless fork (#550)
* Allow forking for ServerlessBackend
* Add forking to TinkerNativeBackend
1 parent 6e45e60 commit 176632c

4 files changed

Lines changed: 709 additions & 2 deletions

File tree

src/art/serverless/backend.py

Lines changed: 216 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,22 @@
1313
from ..types import ServerlessTrainResult, TrainConfig
1414

1515
if TYPE_CHECKING:
16+
import wandb
17+
1618
from ..model import Model, TrainableModel
1719

1820

21+
def _extract_step_from_wandb_artifact(artifact: "wandb.Artifact") -> int | None:
22+
"""Extract step number from a W&B artifact's aliases."""
23+
for alias in artifact.aliases:
24+
if alias.startswith("step"):
25+
try:
26+
return int(alias[4:])
27+
except ValueError:
28+
pass
29+
return None
30+
31+
1932
class ServerlessBackend(Backend):
2033
def __init__(
2134
self, *, api_key: str | None = None, base_url: str | None = None
async def _experimental_push_to_s3(
    self,
    # NOTE(review): the diff elides the leading parameters; `model`,
    # `s3_bucket`, and `prefix` are reconstructed from the docstring —
    # confirm against the full signature in the repository.
    model: "Model | TrainableModel",
    s3_bucket: str | None = None,
    prefix: str | None = None,
    verbose: bool = False,
    delete: bool = False,
) -> None:
    """Push model checkpoints from W&B artifacts to S3.

    Downloads checkpoint(s) from W&B and uploads them to S3.

    Args:
        model: The model whose checkpoints to push.
        s3_bucket: S3 bucket name. If None, uses BACKUP_BUCKET env var.
        prefix: Optional S3 prefix path.
        verbose: Whether to print verbose output.
        delete: Whether to delete files from S3 that don't exist in source.
    """
    from art.utils.s3 import build_s3_path, ensure_bucket_exists, s3_sync

    assert model.id is not None, "Model ID is required"

    # Get all checkpoint steps, oldest first, from the serverless API.
    steps: list[int] = []
    async for checkpoint in self._client.models.checkpoints.list(  # ty:ignore[possibly-missing-attribute]
        model_id=model.id, order="asc"
    ):
        steps.append(checkpoint.step)

    if not steps:
        if verbose:
            print("No checkpoints found to push.")
        return

    await ensure_bucket_exists(s3_bucket)

    for step in steps:
        if verbose:
            print(f"Pushing checkpoint step {step} to S3...")

        # Pull from W&B to local temp dir
        checkpoint_dir = await self._experimental_pull_model_checkpoint(
            model,  # type: ignore[arg-type]
            step=step,
            verbose=verbose,
        )

        # Push to S3
        s3_path = build_s3_path(
            model_name=model.name,
            project=model.project,
            step=step,
            s3_bucket=s3_bucket,
            prefix=prefix,
        )
        await s3_sync(checkpoint_dir, s3_path, verbose=verbose, delete=delete)

    if verbose:
        print(f"Successfully pushed {len(steps)} checkpoint(s) to S3.")

422486
async def _experimental_fork_checkpoint(
    self,
    # NOTE(review): the diff elides the middle parameters; they are
    # reconstructed from the docstring and the matching TinkerNativeBackend
    # signature — confirm against the full signature in the repository.
    model: "Model",
    from_model: str,
    from_project: str | None = None,
    from_s3_bucket: str | None = None,
    not_after_step: int | None = None,
    verbose: bool = False,
    prefix: str | None = None,
) -> None:
    """Fork a checkpoint from another model to initialize this model.

    Pulls the source checkpoint from W&B artifacts (or S3 if from_s3_bucket
    is provided) and uploads it as a W&B artifact for the destination model.

    Note: This uploads the artifact directly to W&B. The ServerlessBackend's
    checkpoint tracking may not immediately reflect the forked checkpoint
    until the next training step.

    Args:
        model: The destination model to fork to.
        from_model: The name of the source model to fork from.
        from_project: The project of the source model. Defaults to model.project.
        from_s3_bucket: Optional S3 bucket to pull the checkpoint from.
        not_after_step: If provided, uses the latest checkpoint <= this step.
        verbose: Whether to print verbose output.
        prefix: Optional S3 prefix for bucket operations.
    """
    import os
    import tempfile

    import wandb

    from_project = from_project or model.project

    if from_s3_bucket is not None:
        # Pull from S3
        from art.utils.s3 import build_s3_path, ensure_bucket_exists, s3_sync
        from art.utils.s3_checkpoint_utils import (
            get_checkpoint_step_not_after_from_s3,
            get_latest_checkpoint_step_from_s3,
        )

        # Resolve which step to fork: the latest one overall, or the latest
        # not exceeding `not_after_step`.
        if not_after_step is None:
            target_step = await get_latest_checkpoint_step_from_s3(
                model_name=from_model,
                project=from_project,
                s3_bucket=from_s3_bucket,
                prefix=prefix,
            )
        else:
            target_step = await get_checkpoint_step_not_after_from_s3(
                model_name=from_model,
                project=from_project,
                not_after_step=not_after_step,
                s3_bucket=from_s3_bucket,
                prefix=prefix,
            )

        if target_step is None:
            raise ValueError(
                f"No suitable checkpoint found in S3 for model {from_model}"
            )

        if verbose:
            print(f"Pulling checkpoint step {target_step} from S3...")

        # Stage the checkpoint under the system temp dir, keyed by
        # project/model/step so repeated forks reuse a stable path.
        checkpoint_dir = os.path.join(
            tempfile.gettempdir(),
            "art_fork_checkpoints",
            from_project,
            from_model,
            f"{target_step:04d}",
        )
        os.makedirs(checkpoint_dir, exist_ok=True)

        s3_path = build_s3_path(
            model_name=from_model,
            project=from_project,
            step=target_step,
            s3_bucket=from_s3_bucket,
            prefix=prefix,
        )
        await ensure_bucket_exists(from_s3_bucket)
        await s3_sync(s3_path, checkpoint_dir, verbose=verbose)
        selected_step = target_step
    else:
        # Pull from W&B artifacts
        api = wandb.Api(api_key=self._client.api_key)  # ty:ignore[possibly-missing-attribute]
        from_entity = model.entity or api.default_entity

        # Iterate all artifact versions to find the best step.
        # We avoid relying on the W&B `:latest` alias because it
        # may not correspond to the highest training step.
        collection_path = f"{from_entity}/{from_project}/{from_model}"
        versions = api.artifacts("lora", collection_path)

        best_step: int | None = None
        best_artifact = None
        for version in versions:
            step_num = _extract_step_from_wandb_artifact(version)
            if step_num is None:
                continue
            if not_after_step is not None and step_num > not_after_step:
                continue
            if best_step is None or step_num > best_step:
                best_step = step_num
                best_artifact = version

        if best_step is None or best_artifact is None:
            if not_after_step is not None:
                raise ValueError(
                    f"No checkpoints found not after step {not_after_step} "
                    f"for model {from_model}"
                )
            raise ValueError(f"No checkpoints found for model {from_model}")
        selected_step = best_step
        artifact = best_artifact

        checkpoint_dir = os.path.join(
            tempfile.gettempdir(),
            "art_fork_checkpoints",
            from_project,
            from_model,
            f"{selected_step:04d}" if selected_step is not None else "latest",
        )
        os.makedirs(checkpoint_dir, exist_ok=True)
        artifact.download(root=checkpoint_dir)

        if verbose:
            print(f"Downloaded source checkpoint step {selected_step} from W&B")

    # Upload as W&B artifact for the destination model
    assert model.entity is not None, "Model entity is required"

    if verbose:
        print(f"Uploading forked checkpoint as W&B artifact for {model.name}...")

    wandb.login(key=self._client.api_key)  # ty:ignore[possibly-missing-attribute]
    # A short-lived run is needed so the artifact upload is attributed
    # to the destination model's project/entity.
    run = wandb.init(
        project=model.project,
        entity=model.entity,
        job_type="checkpoint-fork",
        name=f"fork-{from_model}-to-{model.name}",
        settings=wandb.Settings(silent=True),
    )
    assert run is not None

    dest_artifact = wandb.Artifact(name=model.name, type="lora")
    dest_artifact.add_dir(checkpoint_dir)
    # "step<N>" goes first so downstream alias scans find it before "latest".
    aliases = ["latest"]
    if selected_step is not None:
        aliases.insert(0, f"step{selected_step}")
    run.log_artifact(dest_artifact, aliases=aliases)
    run.finish()

    if verbose:
        print(
            f"Successfully forked checkpoint from {from_model} "
            f"(step {selected_step}) to {model.name}"
        )

src/art/tinker_native/backend.py

Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -778,3 +778,140 @@ def _persist_model_state(self, model: TrainableModel, state: ModelState) -> None
778778
STATE_KEY_LATEST_STEP: state.current_step,
779779
}
780780
)
781+
782+
async def _experimental_fork_checkpoint(
    self,
    model: Model,
    from_model: str,
    from_project: str | None = None,
    from_s3_bucket: str | None = None,
    not_after_step: int | None = None,
    verbose: bool = False,
    prefix: str | None = None,
) -> None:
    """Fork a checkpoint from another TinkerNative model to initialize this model.

    Loads the source model's training checkpoint into the destination model's
    training client directly via tinker:// paths. No local download needed.

    Args:
        model: The destination model to fork to (must already be registered).
        from_model: The name of the source model to fork from.
        from_project: The project of the source model. Defaults to model.project.
        from_s3_bucket: Not supported for TinkerNativeBackend.
        not_after_step: If provided, uses the latest checkpoint <= this step.
        verbose: Whether to print verbose output.
        prefix: Not applicable for TinkerNativeBackend.

    Raises:
        NotImplementedError: If ``from_s3_bucket`` is provided.
        RuntimeError: If the destination model has not been registered.
        FileNotFoundError: If the source model's state.json does not exist.
        ValueError: If the source model has no tinker run IDs, no training
            checkpoints, or no checkpoint at or before ``not_after_step``.
    """
    # Hoisted from mid-function: imports belong at the top of their scope.
    import json

    if from_s3_bucket is not None:
        raise NotImplementedError(
            "from_s3_bucket is not supported for TinkerNativeBackend. "
            "Tinker checkpoints are stored on Tinker infrastructure, not S3."
        )

    trainable_model = cast(TrainableModel, model)

    if trainable_model.name not in self._model_state:
        raise RuntimeError(
            f"Model '{trainable_model.name}' is not registered. "
            "Call register() before forking."
        )

    from_project = from_project or model.project

    # Read the source model's state.json to get its tinker_run_ids
    source_state_dir = get_model_dir(
        Model(name=from_model, project=from_project),
        art_path=self._path,
    )
    source_state_path = f"{source_state_dir}/state.json"

    if not os.path.exists(source_state_path):
        raise FileNotFoundError(
            f"Source model state not found at {source_state_path}. "
            f"Ensure the source model '{from_model}' has been trained "
            f"with this backend."
        )
    with open(source_state_path, "r") as f:
        source_state = json.load(f)

    source_run_ids = list(source_state.get(STATE_KEY_RUN_IDS, []))
    if not source_run_ids:
        raise ValueError(
            f"Source model '{from_model}' has no tinker run IDs in its state."
        )

    # List source model's checkpoints. Sampler checkpoint paths are not
    # needed for forking, so the second element is discarded.
    dest_state = self._model_state[trainable_model.name]
    training_paths, _ = await self._list_checkpoints(
        dest_state.rest_client, source_run_ids
    )

    if not training_paths:
        raise ValueError(
            f"No training checkpoints found for source model '{from_model}'."
        )

    # Select the target step: the latest overall, or the latest at or
    # before `not_after_step` when that bound is given.
    available_steps = sorted(training_paths.keys())
    if not_after_step is not None:
        eligible_steps = [s for s in available_steps if s <= not_after_step]
        if not eligible_steps:
            raise ValueError(
                f"No checkpoint found at or before step {not_after_step}. "
                f"Available steps: {available_steps}"
            )
        target_step = max(eligible_steps)
    else:
        target_step = max(available_steps)

    source_checkpoint_path = training_paths[target_step]
    if verbose:
        print(
            f"Forking from '{from_model}' step {target_step} "
            f"(checkpoint: {source_checkpoint_path})"
        )

    # Load the source checkpoint into a new training client. The optimizer
    # state is reset so the fork starts training fresh from the weights.
    config = self._resolve_model_config(trainable_model)
    new_training_client = await self._create_training_client_from_checkpoint(
        service_client=dest_state.service_client,
        checkpoint_state_path=source_checkpoint_path,
        base_model=trainable_model.base_model,
        training_client_args=config.training_client_args,
        reset_optimizer=True,
    )

    # Save new sampler weights
    checkpoint_name = f"step_{target_step:06d}"
    sampler_response = await self._save_sampler_weights(
        new_training_client, checkpoint_name
    )

    # Create a sampler client from the new weights
    sampler_client = await self._tinker_train_call(
        "create_sampling_client_async",
        new_training_client.create_sampling_client_async(
            model_path=sampler_response.path
        ),
    )

    # Update the destination model's state so subsequent training and
    # sampling resume from the forked step.
    new_run_id = new_training_client.model_id
    if new_run_id not in dest_state.tinker_run_ids:
        dest_state.tinker_run_ids.append(new_run_id)

    dest_state.training_client = new_training_client
    dest_state.current_step = target_step
    dest_state.sampler_clients[target_step] = sampler_client
    dest_state.sampler_checkpoint_paths[target_step] = sampler_response.path
    dest_state.training_checkpoint_paths[target_step] = source_checkpoint_path

    self._persist_model_state(trainable_model, dest_state)

    if verbose:
        print(
            f"Fork complete. Model '{trainable_model.name}' is now at "
            f"step {target_step}."
        )

0 commit comments

Comments (0)