|
9 | 9 |
|
10 | 10 | import art |
11 | 11 | from art.tinker_native import TinkerNativeBackend |
| 12 | +from art.tinker_native.backend import _apply_kl_penalty |
| 13 | +from art.tinker_native.data import trajectory_groups_to_datums |
12 | 14 |
|
# Base model these integration tests target; presumably the fallback returned
# by get_base_model() when no override is configured — TODO confirm against
# that helper's definition.
DEFAULT_BASE_MODEL = "Qwen/Qwen3-30B-A3B-Instruct-2507"
14 | 16 |
|
@@ -37,6 +39,8 @@ async def simple_rollout( |
37 | 39 | max_tokens=10, |
38 | 40 | timeout=60, |
39 | 41 | temperature=1, |
| 42 | + logprobs=True, |
| 43 | + top_logprobs=0, |
40 | 44 | ) |
41 | 45 | choice = chat_completion.choices[0] |
42 | 46 | content = (choice.message.content or "").lower() |
@@ -115,6 +119,85 @@ async def make_group(prompt: str) -> art.TrajectoryGroup: |
115 | 119 | await backend.close() |
116 | 120 |
|
117 | 121 |
|
@pytest.mark.skipif(
    "TINKER_API_KEY" not in os.environ,
    reason="TINKER_API_KEY not set - skipping TinkerNativeBackend KL test",
)
async def test_tinker_native_backend_kl_identity_metric():
    """Verify the KL penalty metric when the reference equals the policy.

    Trains one step with ``kl_penalty_reference_step`` pinned to the current
    step, so the policy and the KL reference are the same weights. The
    reported ``loss/kl_policy_ref`` metric must therefore (a) match an
    independently computed value from ``_apply_kl_penalty`` and (b) be ~0.

    Requires TINKER_API_KEY; performs real rollouts against the backend.
    """
    import asyncio  # hoisted from the nested closure; used by make_group below

    model_name = f"test-tinker-native-kl-{uuid.uuid4().hex[:8]}"
    with tempfile.TemporaryDirectory() as tmpdir:
        backend = TinkerNativeBackend(path=tmpdir)
        model = art.TrainableModel(
            name=model_name,
            project="integration-tests",
            base_model=get_base_model(),
        )
        try:
            await model.register(backend)

            openai_client = model.openai_client()
            current_step = await model.get_step()
            model_name_step = model.get_inference_name(step=current_step)
            prompts = ["Say yes", "Say no", "Say maybe"]

            async def make_group(prompt: str) -> art.TrajectoryGroup:
                # Two rollouts per prompt so each group can show reward
                # variance (checked by ensure_reward_variance below).
                trajectories = await asyncio.gather(
                    *[
                        simple_rollout(openai_client, model_name_step, prompt)
                        for _ in range(2)
                    ]
                )
                return art.TrajectoryGroup(trajectories)  # type: ignore[attr-defined]

            train_groups = await art.gather_trajectory_groups(  # type: ignore[attr-defined]
                [make_group(prompt) for prompt in prompts]
            )
            ensure_reward_variance(train_groups)

            state = backend._model_state[model.name]
            datums = trajectory_groups_to_datums(
                train_groups,
                state.renderer,
                state.tokenizer,
            )
            assert datums

            # Reference sampling client pinned to the *current* step: policy
            # and reference weights are identical, so measured KL should be ~0.
            reference_sampling_client = await backend._get_kl_reference_sampling_client(
                state,
                model.base_model,
                current_step,
            )
            # Reuse the datums computed above — the original recomputed
            # trajectory_groups_to_datums with identical arguments here,
            # re-tokenizing every trajectory for no benefit.
            expected_kl = (
                await _apply_kl_penalty(
                    datums,
                    reference_sampling_client,
                    kl_penalty_coef=0.25,
                )
            )["loss/kl_policy_ref"]

            result = await backend.train(
                model,
                train_groups,
                learning_rate=1e-5,
                kl_penalty_coef=0.25,
                kl_penalty_reference_step=current_step,
            )

            # Backend-reported KL must agree with the standalone computation,
            # and both must be ~0 since policy == reference.
            assert result.metrics["loss/kl_policy_ref"] == pytest.approx(
                expected_kl,
                abs=0.05,
            )
            assert result.metrics["loss/kl_policy_ref"] == pytest.approx(0.0, abs=0.05)
        finally:
            # Always release backend resources, even on assertion failure.
            await backend.close()
| 199 | + |
| 200 | + |
118 | 201 | @pytest.mark.skipif( |
119 | 202 | "TINKER_API_KEY" not in os.environ, |
120 | 203 | reason="TINKER_API_KEY not set - skipping TinkerNativeBackend fork test", |
|
0 commit comments