
Commit 0b1ee70

Flex Attention for Local Megatron Backend (#586)
* megatron: replace fused core attention with compiled flex attention
* Cleanup after codex and minimize a bit.
* Fix bug where Megatron training holds old LoRAs and set Triton/TorchInductor caches explicitly.
1 parent 6559367 commit 0b1ee70

3 files changed

Lines changed: 269 additions & 38 deletions


src/art/megatron/flex_attention.py

Lines changed: 209 additions & 0 deletions
@@ -0,0 +1,209 @@
+"""Flex attention plumbing for ART's Megatron backend."""
+
+import math
+from typing import Any, ClassVar, cast
+
+from megatron.core.packed_seq_params import PackedSeqParams
+from megatron.core.process_groups_config import ProcessGroupCollection
+from megatron.core.transformer.enums import AttnMaskType
+from megatron.core.transformer.transformer_config import TransformerConfig
+from megatron.core.utils import divide
+from pydantic import BaseModel, ConfigDict
+import torch
+from torch import Tensor
+from torch.nn.attention.flex_attention import (
+    BlockMask,
+    create_block_mask,
+    flex_attention,
+)
+
+
+class SharedPrefixAttentionState(BaseModel):
+    """Shared-prefix sparsity metadata for one packed ART training sample."""
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+    block_mask: BlockMask
+
+
+class FlexAttentionWrapper(torch.nn.Module):
+    """Compiled `flex_attention` wrapper with Torchtitan-style inductor options."""
+
+    # Torchtitan inductor options for compiling flex attention.
+    _compile_options = {
+        "max_autotune": True,
+        "coordinate_descent_tuning": True,
+        "triton.cudagraphs": False,
+    }
+    _compiled_flex_attention: ClassVar = torch.compile(
+        flex_attention,
+        options=_compile_options,
+    )
+
+    def forward(
+        self,
+        q: Tensor,
+        k: Tensor,
+        v: Tensor,
+        *,
+        block_mask: BlockMask,
+        scale: float,
+        enable_gqa: bool,
+    ) -> Tensor:
+        # q, k, v are the [B, H, S, D] tensors expected by flex_attention.
+        return cast(
+            Tensor,
+            FlexAttentionWrapper._compiled_flex_attention(
+                q,
+                k,
+                v,
+                block_mask=block_mask,
+                scale=scale,
+                enable_gqa=enable_gqa,
+            ),
+        )
+
+
+_compiled_create_block_mask = torch.compile(create_block_mask)
+
+
+def create_shared_prefix_attention_state(
+    group_ids: Tensor,
+    parent_ids: Tensor,
+) -> SharedPrefixAttentionState:
+    """Build a compiled block mask for ART shared-prefix packing.
+
+    The mask is built on the device of the group_ids tensor.
+
+    Args:
+        group_ids: `[B, S]` group id for each token in a packed sequence.
+        parent_ids: `[B, S]` parent group id for each token in a packed sequence.
+    """
+
+    def _shared_prefix_mask(
+        batch_idx: Tensor,
+        head_idx: Tensor,
+        query_idx: Tensor,
+        kv_idx: Tensor,
+    ) -> Tensor:
+        del head_idx
+        # Token q can attend to token k if k is causal and either belongs to the
+        # same trajectory (traj -> traj) or the same shared prefix (prefix -> prefix)
+        # (same_group), or to the prefix that q builds on (traj -> prefix) (parent_prefix).
+        same_group = group_ids[batch_idx, query_idx] == group_ids[batch_idx, kv_idx]
+        parent_prefix = parent_ids[batch_idx, query_idx] == group_ids[batch_idx, kv_idx]
+        return (query_idx >= kv_idx) & (same_group | parent_prefix)
+
+    block_mask = _compiled_create_block_mask(
+        _shared_prefix_mask,
+        group_ids.shape[0],
+        None,
+        group_ids.shape[1],
+        group_ids.shape[1],
+        device=group_ids.device,
+    )
+    return SharedPrefixAttentionState(block_mask=block_mask)
+
+
+class FlexDotProductAttention(torch.nn.Module):
+    """Megatron core-attention module backed by compiled torch flex attention.
+
+    The current implementation lacks support for fp8 and context parallelism
+    (both of which are available in TEDotProductAttention).
+    """
+
+    def __init__(
+        self,
+        config: TransformerConfig,
+        layer_number: int,
+        attn_mask_type: AttnMaskType,
+        attention_type: str,
+        attention_dropout: float | None = None,
+        softmax_scale: float | None = None,
+        cp_comm_type: str | None = None,
+        pg_collection: ProcessGroupCollection | None = None,
+    ):
+        super().__init__()
+        del (
+            layer_number,
+            attn_mask_type,
+            attention_type,
+            attention_dropout,
+            cp_comm_type,
+        )
+        self.config = config
+        self.flex_attention = FlexAttentionWrapper()
+
+        if pg_collection is None:
+            tp_world_size = self.config.tensor_model_parallel_size
+        else:
+            tp_world_size = pg_collection.tp.size()
+
+        kv_channels = self.config.kv_channels
+        assert kv_channels is not None, "Megatron config must provide kv_channels."
+        projection_size = kv_channels * self.config.num_attention_heads
+        self.hidden_size_per_partition = divide(projection_size, tp_world_size)
+        num_query_groups = (
+            self.config.num_query_groups or self.config.num_attention_heads
+        )
+        self.num_attention_heads_per_partition = divide(
+            self.config.num_attention_heads, tp_world_size
+        )
+        self.num_query_groups_per_partition = divide(num_query_groups, tp_world_size)
+
+        if softmax_scale is None:
+            head_dim = divide(projection_size, self.config.num_attention_heads)
+            self.softmax_scale = 1.0 / math.sqrt(head_dim)
+        else:
+            self.softmax_scale = softmax_scale
+
+    def forward(
+        self,
+        query: Tensor,
+        key: Tensor,
+        value: Tensor,
+        attention_mask: Tensor,
+        attn_mask_type: AttnMaskType | None = None,
+        attention_bias: Any = None,
+        packed_seq_params: PackedSeqParams | None = None,
+    ) -> Tensor:
+        """Compute self attention with compiled flex kernels.
+
+        Args:
+            query: `[S, B, Hq, D]`
+            key: `[S, B, Hkv, D]`
+            value: `[S, B, Hkv, D]`
+            attention_mask: unused placeholder tensor kept for the Megatron checkpoint API.
+            attention_bias: `SharedPrefixAttentionState` or `BlockMask`.
+        """
+
+        del attention_mask, attn_mask_type
+        assert packed_seq_params is None, (
+            "PackedSeqParams is not used in ART Megatron flex path."
+        )
+
+        if isinstance(attention_bias, SharedPrefixAttentionState):
+            block_mask = attention_bias.block_mask
+        else:
+            assert isinstance(attention_bias, BlockMask), (
+                "Expected a flex BlockMask in attention_bias."
+            )
+            block_mask = attention_bias
+
+        # Megatron uses [S, B, H, D], while flex attention expects [B, H, S, D].
+        q = query.permute(1, 2, 0, 3)
+        k = key.permute(1, 2, 0, 3)
+        v = value.permute(1, 2, 0, 3)
+
+        out = self.flex_attention(
+            q,
+            k,
+            v,
+            block_mask=block_mask,
+            scale=self.softmax_scale,
+            enable_gqa=self.num_attention_heads_per_partition
+            != self.num_query_groups_per_partition,
+        )
+
+        # Return to Megatron's expected layout [S, B, Hq*D].
+        out = out.permute(2, 0, 1, 3).contiguous()
+        out = out.view(out.size(0), out.size(1), self.hidden_size_per_partition)
+        return out
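For intuition, here is a minimal CPU-only sketch of the rule that `_shared_prefix_mask` encodes, written with dense boolean tensors instead of a flex `BlockMask` (it mirrors the dense `calculate_mask` helper that this commit deletes from `train.py`). The toy packing below (one shared prefix in group 0 followed by two trajectories in groups 1 and 2) is invented purely for illustration.

import torch

# Toy packed sequence: 2 prefix tokens (group 0), then two 2-token
# trajectories (groups 1 and 2) that both continue the shared prefix.
group_ids = torch.tensor([[0, 0, 1, 1, 2, 2]])
parent_ids = torch.tensor([[0, 0, 0, 0, 0, 0]])

# Dense form of the shared-prefix rule: q may attend to k iff k is not in the
# future and k is in q's own group or in the prefix group that q descends from.
causal = torch.tril(torch.ones(6, 6, dtype=torch.bool)).unsqueeze(0)
same_group = group_ids.unsqueeze(2) == group_ids.unsqueeze(1)
parent_prefix = parent_ids.unsqueeze(2) == group_ids.unsqueeze(1)
mask = causal & (same_group | parent_prefix)

# Rows 2-5 (trajectory tokens) see the prefix (columns 0-1) and their own
# trajectory, but the two trajectories never attend to each other.
print(mask[0].int())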

src/art/megatron/provider.py

Lines changed: 37 additions & 1 deletion
@@ -1,9 +1,32 @@
+import copy
+from functools import partial
+import inspect
+from typing import Callable
 from megatron.bridge import AutoBridge
 from megatron.bridge.models.gpt_provider import GPTModelProvider
 from megatron.bridge.models.qwen.qwen3_moe_bridge import Qwen3MoEBridge
 from megatron.core.transformer.enums import AttnBackend
+from megatron.core.transformer.spec_utils import ModuleSpec
 import torch
 
+from art.megatron.flex_attention import FlexDotProductAttention
+
+
+def _resolve_layer_spec(
+    base_layer_spec: ModuleSpec | Callable[[GPTModelProvider], ModuleSpec],
+    config: GPTModelProvider,
+    vp_stage: int | None = None,
+) -> ModuleSpec:
+    if isinstance(base_layer_spec, ModuleSpec):
+        return copy.deepcopy(base_layer_spec)
+    kwargs = (
+        {"vp_stage": vp_stage}
+        if "vp_stage" in inspect.signature(base_layer_spec).parameters
+        else {}
+    )
+    return base_layer_spec(config, **kwargs)
+
 
 def get_provider(model: str) -> GPTModelProvider:
     bridge = AutoBridge.from_hf_pretrained(
@@ -15,7 +38,20 @@ def get_provider(model: str) -> GPTModelProvider:
         "Only Qwen3 MoE models are supported"
     )
     provider = bridge.to_megatron_provider()
-    provider.attention_backend = AttnBackend.fused
+    base_layer_spec = provider.transformer_layer_spec
+
+    def _flex_attention_layer_spec(
+        config: GPTModelProvider, vp_stage: int | None = None
+    ) -> ModuleSpec:
+        layer_spec = _resolve_layer_spec(base_layer_spec, config, vp_stage)
+        # Keep Megatron's standard layer stack and replace only core attention.
+        layer_spec.submodules.self_attention.submodules.core_attention = (  # ty: ignore[unresolved-attribute]
+            FlexDotProductAttention
+        )
+        return layer_spec
+
+    provider.transformer_layer_spec = _flex_attention_layer_spec
+    provider.attention_backend = AttnBackend.auto
     provider.recompute_granularity = "full"
     provider.recompute_method = "uniform"
     provider.recompute_num_layers = 1
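A note on the indirection above: `provider.transformer_layer_spec` can hold either a ready-made `ModuleSpec` or a factory callable that may or may not accept a `vp_stage` (virtual pipeline stage) argument, which is why `_resolve_layer_spec` branches on the type and inspects the factory's signature. Below is a small self-contained sketch of that resolve-then-patch pattern, using a hypothetical `FakeSpec` stand-in rather than Megatron's real `ModuleSpec`.

import copy
import inspect
from dataclasses import dataclass
from typing import Callable


@dataclass
class FakeSpec:
    """Hypothetical stand-in for megatron.core.transformer.spec_utils.ModuleSpec."""
    core_attention: type


def resolve(spec: FakeSpec | Callable[..., FakeSpec], config: object,
            vp_stage: int | None = None) -> FakeSpec:
    if isinstance(spec, FakeSpec):
        # Deep-copy so patching the result never mutates the provider's original spec.
        return copy.deepcopy(spec)
    # Only forward vp_stage when the factory actually declares that parameter.
    kwargs = (
        {"vp_stage": vp_stage}
        if "vp_stage" in inspect.signature(spec).parameters
        else {}
    )
    return spec(config, **kwargs)


def legacy_factory(config: object) -> FakeSpec:  # factory without a vp_stage parameter
    return FakeSpec(core_attention=object)


spec = resolve(legacy_factory, config=None, vp_stage=3)
spec.core_attention = str  # patch a single submodule, as get_provider does with FlexDotProductAttention
print(spec.core_attention)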

src/art/megatron/train.py

Lines changed: 23 additions & 37 deletions
@@ -1,9 +1,18 @@
 # isort: off
 import os
 
+
+def _set_cache_dir(env_var: str, default_path: str) -> None:
+    if not os.environ.get(env_var):
+        os.environ[env_var] = os.path.expanduser(default_path)
+    os.makedirs(os.environ[env_var], exist_ok=True)
+
+
 os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
 os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
 os.environ["TORCH_CUDA_ARCH_LIST"] = "9.0"
+_set_cache_dir("TORCHINDUCTOR_CACHE_DIR", "~/.cache/torchinductor")
+_set_cache_dir("TRITON_CACHE_DIR", "~/.triton/cache")
 # isort: on
 
 import gc
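The `_set_cache_dir` helper fills in a default only when the variable is unset, so an operator-provided cache location wins. A quick illustrative sketch of that behavior follows; the `MY_CACHE_DIR` name and temporary paths are hypothetical, and the helper body is repeated under the assumption it matches the one above.

import os
import tempfile


def _set_cache_dir(env_var: str, default_path: str) -> None:
    # Keep an existing value, otherwise apply the default; ensure the directory exists.
    if not os.environ.get(env_var):
        os.environ[env_var] = os.path.expanduser(default_path)
    os.makedirs(os.environ[env_var], exist_ok=True)


with tempfile.TemporaryDirectory() as tmp:
    os.environ.pop("MY_CACHE_DIR", None)
    _set_cache_dir("MY_CACHE_DIR", os.path.join(tmp, "default"))
    print(os.environ["MY_CACHE_DIR"])  # <tmp>/default, now created on disk

    os.environ["MY_CACHE_DIR"] = os.path.join(tmp, "custom")
    _set_cache_dir("MY_CACHE_DIR", os.path.join(tmp, "default"))
    print(os.environ["MY_CACHE_DIR"])  # still <tmp>/custom; the default is not applied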
@@ -21,9 +30,11 @@
 from pydantic import BaseModel
 from safetensors.torch import load_file, save_file
 import torch
+from torch._inductor.runtime.cache_dir_utils import cache_dir as inductor_cache_dir
 
 from art import dev, types
 from art.loss import loss_fn, shift_tensor
+from art.megatron.flex_attention import create_shared_prefix_attention_state
 from art.megatron.lora import apply_lora_adapters
 from art.megatron.offload import OffloadState, offload_to_cpu, reload_to_gpu
 from art.megatron.provider import get_provider
@@ -55,6 +66,11 @@ def freeze_model(model_chunks: list[MegatronModule]) -> list[MegatronModule]:
 rank = torch.distributed.get_rank()
 world_size = torch.distributed.get_world_size()
 
+if rank == 0:
+    print("TORCHINDUCTOR_CACHE_DIR:", os.environ["TORCHINDUCTOR_CACHE_DIR"])
+    print("Resolved inductor cache_dir():", inductor_cache_dir())
+    print("TRITON_CACHE_DIR:", os.environ["TRITON_CACHE_DIR"])
+
 for module in model:
     while not isinstance(module, GPTModel) and hasattr(module, "module"):
         module = module.module
@@ -122,31 +138,6 @@ def print0(*values: Any) -> None:
 offload_state = OffloadState()
 
 
-def calculate_mask(
-    batch_size: int,
-    seq_len: int,
-    device: torch.device,
-    group_ids: torch.Tensor,
-    parent_ids: torch.Tensor,
-) -> torch.Tensor:
-    causal_mask = (
-        torch.tril(
-            torch.ones(
-                seq_len,
-                seq_len,
-                dtype=torch.bool,
-                device=device,
-            )
-        )
-        .unsqueeze(0)
-        .expand(batch_size, seq_len, seq_len)
-    )
-    group_mask = group_ids.unsqueeze(2) == group_ids.unsqueeze(1)
-    parent_mask = parent_ids.unsqueeze(2) == group_ids.unsqueeze(1)
-    mask = causal_mask & (group_mask | parent_mask)
-    return mask
-
-
 offload_to_cpu(model, optimizer, rank, offload_state)
 
 while True:
@@ -236,26 +227,19 @@ def calculate_mask(
         for key, value in inputs.items():
             if isinstance(value, torch.Tensor):
                 inputs[key] = value.to(device)  # type: ignore
-        attention_mask = ~calculate_mask(
-            batch_size=inputs["tokens"].shape[0],
-            seq_len=inputs["tokens"].shape[1],
-            device=device,
+        attention_state = create_shared_prefix_attention_state(  # should happen after group_ids is moved to device
            group_ids=inputs["group_ids"],
            parent_ids=inputs["parent_ids"],
-        ).unsqueeze(1)  # add head dimension [B, H=1, S, S]
-        attention_bias = torch.where(
-            attention_mask,
-            torch.tensor(
-                float("-inf"), dtype=next(model[0].parameters()).dtype, device=device
-            ),
-            torch.tensor(0.0, dtype=next(model[0].parameters()).dtype, device=device),
        )
+        # Megatron full-layer recompute saves positional tensor args, so keep a tiny
+        # placeholder Tensor here and pass flex BlockMask state via attention_bias.
+        attention_mask = torch.zeros((1, 1, 1, 1), dtype=torch.bool, device=device)
         new_logprobs: torch.Tensor = -model[0](
             input_ids=inputs["tokens"],
             position_ids=inputs["input_pos"],
             attention_mask=attention_mask,
             labels=shift_tensor(inputs["tokens"], 0),
-            extra_block_kwargs={"attention_bias": attention_bias},
+            extra_block_kwargs={"attention_bias": attention_state},
         )
         loss = loss_fn(
             inputs,  # type: ignore
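A rough sense of why the dense `attention_bias` path was worth removing: the old code materialized a `[B, 1, S, S]` additive bias in the model dtype, which grows quadratically with the packed sequence length, while the flex `BlockMask` keeps only block-level sparsity metadata. Back-of-envelope numbers follow, assuming a hypothetical packed length of 32k tokens in bf16.

seq_len, bytes_per_elem = 32_768, 2  # hypothetical packed length, bf16

dense_bias_bytes = 1 * 1 * seq_len * seq_len * bytes_per_elem  # [B=1, H=1, S, S]
print(dense_bias_bytes / 2**30, "GiB")  # 2.0 GiB for the bias tensor alone

# A BlockMask over the same sequence tracks sparsity at (by default) 128x128
# block granularity, i.e. on the order of (32768 / 128) ** 2 = 65536 block entries.
print((seq_len // 128) ** 2, "block entries")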
@@ -332,9 +316,11 @@ def calculate_mask(
     offload_to_cpu(model, optimizer, rank, offload_state)
     # Release mmap-backed packed tensor references on all ranks before rank0 cleanup.
     del packed_tensors
+    del adapter_model
     if "inputs" in locals():
         del inputs
     gc.collect()
+    torch.cuda.empty_cache()
     # Ensure all ranks have finished saving before signaling completion
     torch.distributed.barrier()
     if rank == 0:
