
Commit f5d72b5

Enhance Megatron model configuration and training index handling
- Added internal configuration for model registration in yes-no-maybe-megatron.py to optimize GPU memory utilization and tensor parallel size.
- Refactored index calculation in train.py to improve efficiency and handle cases where indices may be empty, ensuring robust data parallelism during training.
1 parent 0ad9a89 commit f5d72b5

2 files changed: 15 additions & 12 deletions

dev/yes-no-maybe-megatron.py (7 additions, 0 deletions)
```diff
@@ -4,6 +4,7 @@
 
 from dotenv import load_dotenv
 import openai
+import torch
 
 import art
 from art.megatron import MegatronBackend
@@ -43,6 +44,12 @@ async def main():
         name=os.environ.get("MODEL_NAME", "megatron-001"),
         project="yes-no-maybe-megatron",
         base_model=base_model,
+        _internal_config=art.dev.InternalModelConfig(
+            engine_args=art.dev.EngineArgs(
+                gpu_memory_utilization=0.8,
+                tensor_parallel_size=torch.cuda.device_count(),
+            ),
+        ),
     )
     await model.register(backend)
```
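
The two new engine args work together: `tensor_parallel_size=torch.cuda.device_count()` shards the inference engine across every visible GPU, while `gpu_memory_utilization=0.8` reserves a fraction of each device for the engine. A minimal sketch of what those values resolve to on a CUDA host, assuming `art.dev.EngineArgs` forwards to a vLLM-style engine (the printed budget is illustrative; the engine manages the actual allocation):

```python
import torch

# Assumes a CUDA host; shows what the new engine args evaluate to.
if torch.cuda.is_available():
    n_gpus = torch.cuda.device_count()  # value passed as tensor_parallel_size
    total = torch.cuda.get_device_properties(0).total_memory
    print(f"tensor_parallel_size = {n_gpus}")
    # gpu_memory_utilization=0.8 targets ~80% of each device's memory,
    # leaving headroom for the training process on the same GPUs.
    print(f"per-GPU engine budget ≈ {0.8 * total / 2**30:.1f} GiB")
```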

src/art/megatron/train.py (8 additions, 12 deletions)
```diff
@@ -8,6 +8,7 @@
 
 import gc
 import json
+import math
 import shutil
 import time
 from typing import Any, cast
@@ -213,18 +214,13 @@ def calculate_mask(
     num_sequences = job.disk_packed_tensors["num_sequences"]
     dp_rank = ps.get_data_parallel_rank()
     dp_world_size = ps.get_data_parallel_world_size()
-    indices = list(
-        range(
-            dp_rank,
-            num_sequences,
-            dp_world_size,
-        )
-    )
-    # pad indices
-    if num_sequences % dp_world_size <= dp_rank > 0:
-        indices.append(
-            (list(range(num_sequences)) * (dp_world_size // num_sequences + 1))[dp_rank]
-        )
+    num_indices = math.ceil(num_sequences / dp_world_size)
+    indices = list(range(dp_rank, num_sequences, dp_world_size))
+    if not indices:
+        indices = [dp_rank % num_sequences]
+    # pad indices by repeating & slicing to target length
+    repeat = math.ceil(num_indices / len(indices))
+    indices = (indices * repeat)[:num_indices]
     for index in indices:
         inputs = PackedTensors(  # type: ignore
             **{
```
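
The refactor replaces the old hard-to-read padding condition with a uniform repeat-and-slice scheme: every data-parallel rank ends up with exactly `ceil(num_sequences / dp_world_size)` indices, even when there are more ranks than sequences, so collective operations stay in lockstep across the group. A standalone sketch of the same logic (the function name and the asserts are illustrative, not the module's actual API):

```python
import math

def rank_indices(num_sequences: int, dp_rank: int, dp_world_size: int) -> list[int]:
    # Every rank must process the same number of indices so that the
    # data-parallel group stays in sync.
    num_indices = math.ceil(num_sequences / dp_world_size)
    # Round-robin assignment: rank r takes sequences r, r + world_size, ...
    indices = list(range(dp_rank, num_sequences, dp_world_size))
    if not indices:
        # More ranks than sequences: wrap around so the rank still has
        # work (the old code could leave this list empty).
        indices = [dp_rank % num_sequences]
    # Pad by repeating and slicing to the common target length.
    repeat = math.ceil(num_indices / len(indices))
    return (indices * repeat)[:num_indices]

# 5 sequences over 2 ranks: each rank gets ceil(5/2) = 3 indices.
assert rank_indices(5, 0, 2) == [0, 2, 4]
assert rank_indices(5, 1, 2) == [1, 3, 1]
# More ranks than sequences (2 sequences, 4 ranks): rank 3 wraps to index 1.
assert rank_indices(2, 3, 4) == [1]
```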
