feat: support sequence parallel with fused_add_rms_norm
- tests/test_fused_add_rms_norm_sequence_parallel.py +176 -0
- tests/test_rms_norm_sequence_parallel.py +1 -1
- torch-ext/activation/__init__.py +2 -1
- torch-ext/activation/fused_add_rms_norm_meta.py +199 -0
- torch-ext/activation/parallel_style.py +50 -0
- torch-ext/activation/rms_norm.py +4 -2
- torch-ext/activation/rms_norm_meta.py +26 -17
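In short, the commit adds a `ResidualSequenceParallel` parallelize plan and DTensor sharding strategies so that `FusedAddRMSNorm` (which takes an extra residual input and returns two outputs) can run under sequence parallelism. The sketch below condenses the usage pattern exercised by the new test; it is illustrative only (the `Block` module and shapes are not part of the commit) and assumes a process group is already initialized, e.g. via `torchrun --nproc-per-node=2`.

# Illustrative sketch (not part of the commit); assumes dist is initialized.
import torch
import torch.distributed as dist
import activation
from activation.parallel_style import ResidualSequenceParallel
from torch.distributed._tensor import DTensor
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor.parallel import parallelize_module
from torch.distributed.tensor.placement_types import Shard


class Block(torch.nn.Module):  # hypothetical wrapper, mirrors the test's Model

    def __init__(self, d: int) -> None:
        super().__init__()
        self.fused_add_rms_norm = activation.layers.FusedAddRMSNorm(d)

    def forward(self, x, residual):
        return self.fused_add_rms_norm(x, residual=residual)


mesh = init_device_mesh("cuda", (dist.get_world_size(), ),
                        mesh_dim_names=("shard", ))
block = Block(16).cuda()
parallelize_module(
    block, mesh,
    {"fused_add_rms_norm": ResidualSequenceParallel(sequence_dim=0)})

# Each rank holds its local [T/world_size, D] slice of x and residual.
x = DTensor.from_local(torch.randn(256, 16, device="cuda"),
                       device_mesh=mesh, placements=(Shard(0), ))
residual = DTensor.from_local(torch.randn(256, 16, device="cuda"),
                              device_mesh=mesh, placements=(Shard(0), ))
y, add_output = block(x, residual)  # both outputs stay sharded on dim 0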
tests/test_fused_add_rms_norm_sequence_parallel.py
ADDED
@@ -0,0 +1,176 @@
import random
import sys
from collections.abc import Sequence

import pytest
import torch
import torch.distributed as dist
from packaging import version
from torch.distributed.tensor.placement_types import (Partial, Placement,
                                                       Replicate, Shard)

import activation

from .utils import assert_close, opcheck

DTYPES = [torch.float32]
NUM_TOKENS = [512]  # Arbitrary values for testing
SEQUENCE_DIMS = [0, 1]  # 0 is for [T, D] (packed), 1 is for [B, S, D]
D = [16]  # Arbitrary values for testing
SEEDS = [0]

from activation.parallel_style import ResidualSequenceParallel
from torch.distributed._tensor import DTensor
from torch.distributed.device_mesh import DeviceMesh, init_device_mesh
from torch.distributed.tensor.parallel import parallelize_module


@pytest.fixture(scope="session", autouse=True)
def init_dist(request):
    if version.parse(torch.__version__) < version.parse("2.8"):
        pytest.skip("torch>=2.8.0 is required for sequence parallel")
        return

    try:
        dist.init_process_group(backend="nccl")
        torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())
    except Exception as e:
        print(f"Failed to initialize torch.distributed: {e}")
        pytest.skip("Failed to initialize torch.distributed")

    if dist.get_world_size() < 2:
        pytest.skip("Need at least 2 processes in dist group. "
                    "You can run with `torchrun --nproc-per-node=2 "
                    "--local-ranks-filter 0 -m pytest "
                    "test_rms_norm_sequence_parallel.py`")

    yield
    dist.destroy_process_group()


class Model(torch.nn.Module):

    def __init__(self, num_tokens, d) -> None:
        super().__init__()
        self.fused_add_rms_norm = activation.layers.FusedAddRMSNorm(d)

    def forward(self, x: torch.Tensor, residual: torch.Tensor) -> torch.Tensor:
        return self.fused_add_rms_norm(x, residual=residual)


@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("d", D)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("sequence_dim", SEQUENCE_DIMS)
@pytest.mark.parametrize("x_requires_grad", [True, False])
@pytest.mark.parametrize("residual_requires_grad", [True, False])
def test_fused_add_rms_norm_sequence_parallel(
    num_tokens: int,
    d: int,
    dtype: torch.dtype,
    seed: int,
    sequence_dim: int,
    x_requires_grad: bool,
    residual_requires_grad: bool,
) -> None:
    if num_tokens % dist.get_world_size() != 0:
        # It hangs at `y.full_tensor()` if not divisible
        pytest.skip("num_tokens must be divisible by world_size for sharding")

    if not x_requires_grad and not residual_requires_grad:
        pytest.skip("For now, at least one of x or residual must require grad")

    random.seed(seed)
    torch.manual_seed(seed)

    num_ranks = dist.get_world_size()
    rank = dist.get_rank()
    mesh = init_device_mesh("cuda", (num_ranks, ), mesh_dim_names=("shard", ))

    match sequence_dim:
        case 0:
            x_shape = (num_tokens, d)
        case 1:
            BATCH_SIZE = 2
            x_shape = (BATCH_SIZE, num_tokens, d)
        case _:
            raise ValueError(f"Invalid sequence_dim: {sequence_dim}")

    x = torch.randn(x_shape, dtype=dtype, requires_grad=x_requires_grad).cuda()
    residual = torch.randn(x_shape,
                           dtype=dtype,
                           requires_grad=residual_requires_grad).cuda()
    weight = torch.ones(d, dtype=dtype, requires_grad=True).cuda()
    eps = 1e-05

    if x_requires_grad:
        x.retain_grad()
    if residual_requires_grad:
        residual.retain_grad()
    weight.retain_grad()

    # Copy x, weight for reference
    x_ref = x.detach().clone().requires_grad_(True)
    residual_ref = residual.detach().clone().requires_grad_(True)
    weight_ref = weight.detach().clone().requires_grad_(True)

    model_sharded = Model(num_tokens, d).to(dtype=dtype).cuda()
    model_sharded.fused_add_rms_norm.weight = torch.nn.Parameter(weight)
    parallelize_module(model_sharded, mesh, {
        "fused_add_rms_norm":
        ResidualSequenceParallel(sequence_dim=sequence_dim)
    })

    x_sharded = DTensor.from_local(
        x.chunk(num_ranks, dim=sequence_dim)[rank].contiguous(),
        placements=(Shard(sequence_dim), ),
        device_mesh=mesh,
    )
    residual_sharded = DTensor.from_local(
        residual.chunk(num_ranks, dim=sequence_dim)[rank].contiguous(),
        placements=(Shard(sequence_dim), ),
        device_mesh=mesh,
    )

    y, add_output = model_sharded(x_sharded, residual_sharded)

    y_from_sharded = y.full_tensor()
    add_output_from_sharded = add_output.full_tensor()

    model_unsharded = Model(num_tokens, d).to(dtype=dtype).cuda()
    model_unsharded.fused_add_rms_norm.weight = torch.nn.Parameter(weight_ref)

    y_from_unsharded, add_output_from_unsharded = model_unsharded(
        x_ref, residual_ref)

    assert_close(y_from_sharded, y_from_unsharded)
    assert_close(add_output_from_sharded, add_output_from_unsharded)

    # Backward
    y_grad = torch.randn_like(y_from_unsharded)
    add_output_grad = torch.randn_like(add_output_from_unsharded)

    (y_grad * y_from_sharded +
     add_output_grad * add_output_from_sharded).sum().backward()
    (y_grad * y_from_unsharded +
     add_output_grad * add_output_from_unsharded).sum().backward()

    weight_grad_from_sharded = model_sharded.fused_add_rms_norm.weight.grad._local_tensor
    weight_grad_from_unsharded = model_unsharded.fused_add_rms_norm.weight.grad

    assert (x.grad is None) ^ x_requires_grad
    assert (residual.grad is None) ^ residual_requires_grad

    torch.distributed.all_reduce(weight_grad_from_sharded,
                                 op=torch.distributed.ReduceOp.SUM)

    if x.grad is not None:
        torch.distributed.all_reduce(x.grad, op=torch.distributed.ReduceOp.SUM)
        assert_close(x.grad, x_ref.grad)
    if residual.grad is not None:
        torch.distributed.all_reduce(residual.grad,
                                     op=torch.distributed.ReduceOp.SUM)
        assert_close(residual.grad, residual_ref.grad)

    assert_close(weight_grad_from_sharded, weight_grad_from_unsharded)
tests/test_rms_norm_sequence_parallel.py
CHANGED
@@ -63,7 +63,7 @@ class Model(torch.nn.Module):
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("seed", SEEDS)
 @pytest.mark.parametrize("sequence_dim", SEQUENCE_DIMS)
-def
+def test_rms_norm_sequence_parallel(
     num_tokens: int,
     d: int,
     dtype: torch.dtype,
torch-ext/activation/__init__.py
CHANGED
@@ -1,6 +1,6 @@
 import torch
 
-from . import layers
+from . import layers, parallel_style
 from ._ops import ops
 from .poly_norm import FusedMulPolyNormFunction, PolyNormFunction
 from .rms_norm import FusedAddRMSNormFunction, RMSNormFunction
@@ -48,5 +48,6 @@ __all__ = [
     "rms_norm",
     "fused_add_rms_norm",
     "layers",
+    "parallel_style",
     "ops",
 ]
torch-ext/activation/fused_add_rms_norm_meta.py
ADDED
@@ -0,0 +1,199 @@
from collections.abc import Sequence

import torch
from torch.distributed.tensor._dtensor_spec import DTensorSpec
from torch.distributed.tensor._op_schema import (OpSchema, OpSpec, OpStrategy,
                                                 RuntimeSchemaInfo)
from torch.distributed.tensor._ops.utils import (generate_redistribute_costs,
                                                 register_op_strategy)
from torch.distributed.tensor.placement_types import (Placement, Replicate,
                                                       Shard)

from ._ops import ops


def register_fused_add_rms_norm_meta():
    """Dummy function to register the meta functions.

    Registration happens at import time by the decorators below.
    """
    pass


def _replicate_dims_start_at(placements: Sequence[Placement],
                             start_dim: int = 0) -> tuple[Placement, ...]:
    new_placements: list[Placement] = []
    for p in placements:
        if p.is_partial() or (isinstance(p, Shard) and p.dim >= start_dim):
            new_placements.append(Replicate())  # make it replicate
        else:
            new_placements.append(p)  # keep the placement
    return tuple(new_placements)


@register_op_strategy(ops.fused_add_rms_norm.default,
                      schema_info=RuntimeSchemaInfo(1))
def fused_add_rms_norm_strategy(op_schema: OpSchema) -> OpStrategy:
    mesh = op_schema.get_mesh_from_args()

    assert len(op_schema.args_schema) == 4
    (
        input_strategy,
        residual_strategy,
        weight_strategy,
        _,  # eps
    ) = op_schema.args_schema

    assert isinstance(input_strategy, OpStrategy)
    assert isinstance(residual_strategy, OpStrategy)
    assert isinstance(weight_strategy, OpStrategy)

    lengths = {
        "input": len(input_strategy.strategies),
        "residual": len(residual_strategy.strategies),
        "weight": len(weight_strategy.strategies),
    }
    assert len(set(
        lengths.values())) == 1, f"Strategy length mismatch: {lengths}"

    last_dim = input_strategy.ndim - 1
    strategy = OpStrategy([])
    for input, residual, weight in zip(input_strategy.strategies,
                                       residual_strategy.strategies,
                                       weight_strategy.strategies):

        input_src = input.output_spec
        residual_src = residual.output_spec
        weight_src = weight.output_spec

        assert isinstance(input_src, DTensorSpec)
        assert isinstance(residual_src, DTensorSpec)
        assert isinstance(weight_src, DTensorSpec)

        redistribute_costs = []

        # Input can be sharded in any dim except the last dim.
        input_tgt = DTensorSpec(
            mesh=mesh,
            placements=_replicate_dims_start_at(input_src.placements,
                                                last_dim),
            tensor_meta=input_src.tensor_meta,
        )
        redistribute_costs.append(
            generate_redistribute_costs(input_strategy, input_tgt))

        # Residual add must have the same sharding as input.
        residual_tgt = input_tgt
        redistribute_costs.append(
            generate_redistribute_costs(residual_strategy, residual_tgt))

        # Weight cannot be sharded, so always replicate it.
        weight_tgt = DTensorSpec(
            mesh=mesh,
            placements=(Replicate(), ),
            tensor_meta=weight_src.tensor_meta,
        )
        redistribute_costs.append(
            generate_redistribute_costs(weight_strategy, weight_tgt))

        strategy.strategies.append(
            OpSpec(
                output_specs=[input_tgt, input_tgt],
                input_specs=[input_tgt, residual_tgt, weight_tgt],
                redistribute_cost=redistribute_costs,
            ))
    return strategy


@register_op_strategy(ops.fused_add_rms_norm_backward.default,
                      schema_info=RuntimeSchemaInfo(2))
def fused_add_rms_norm_backward_strategy(op_schema: OpSchema) -> OpStrategy:
    mesh = op_schema.get_mesh_from_args()

    assert len(op_schema.args_schema) == 6
    (
        output_grad_strategy,
        add_output_grad_strategy,
        add_output_strategy,
        weight_strategy,
        _,  # eps
        need_input_grad,  # need_input_grad
    ) = op_schema.args_schema

    assert isinstance(output_grad_strategy, OpStrategy)
    assert isinstance(add_output_grad_strategy, OpStrategy)
    assert isinstance(add_output_strategy, OpStrategy)
    assert isinstance(weight_strategy, OpStrategy)

    lengths = {
        "output_grad": len(output_grad_strategy.strategies),
        "add_output_grad": len(add_output_grad_strategy.strategies),
        "add_output": len(add_output_strategy.strategies),
        "weight": len(weight_strategy.strategies),
    }
    assert len(set(
        lengths.values())) == 1, f"Strategy length mismatch: {lengths}"

    zipped = zip(
        output_grad_strategy.strategies,
        add_output_grad_strategy.strategies,
        add_output_strategy.strategies,
        weight_strategy.strategies,
    )

    last_dim = output_grad_strategy.ndim - 1
    strategy = OpStrategy([])
    for output_grad, add_output_grad, add_output, weight in zipped:
        output_grad_src = output_grad.output_spec
        add_output_grad_src = add_output_grad.output_spec
        add_output_src = add_output.output_spec
        weight_src = weight.output_spec

        assert isinstance(output_grad_src, DTensorSpec)
        assert isinstance(add_output_grad_src, DTensorSpec)
        assert isinstance(add_output_src, DTensorSpec)
        assert isinstance(weight_src, DTensorSpec)

        redistribute_costs = []

        # output grad can be sharded in any dim except the last dim.
        output_grad_tgt = DTensorSpec(
            mesh=mesh,
            placements=_replicate_dims_start_at(output_grad_src.placements,
                                                last_dim),
            tensor_meta=output_grad_src.tensor_meta,
        )
        redistribute_costs.append(
            generate_redistribute_costs(output_grad_strategy, output_grad_tgt))

        # add_output_grad must have the same sharding as output_grad.
        add_output_grad_tgt = output_grad_tgt
        redistribute_costs.append(
            generate_redistribute_costs(add_output_grad_strategy,
                                        add_output_grad_tgt))

        # add_output must have the same sharding as output_grad.
        add_output_tgt = output_grad_tgt
        redistribute_costs.append(
            generate_redistribute_costs(add_output_strategy, add_output_tgt))

        # Weight cannot be sharded, so always replicate it.
        weight_tgt = DTensorSpec(
            mesh=mesh,
            placements=(Replicate(), ),
            tensor_meta=weight_src.tensor_meta,
        )
        redistribute_costs.append(
            generate_redistribute_costs(weight_strategy, weight_tgt))

        strategy.strategies.append(
            OpSpec(
                output_specs=[
                    output_grad_tgt if need_input_grad else None, weight_tgt
                ],
                input_specs=[
                    output_grad_tgt, add_output_grad_tgt, add_output_tgt,
                    weight_tgt
                ],
                redistribute_cost=redistribute_costs,
            ))
    return strategy
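For intuition about the helper above: `_replicate_dims_start_at` keeps sharding on dims before `start_dim` and forces everything else, including pending partial reductions, to `Replicate`, which is what lets these strategies accept sequence-sharded activations while never leaving the hidden dim sharded. A minimal check of that behavior (illustrative only, not part of the commit):

# Illustrative check (not part of the commit): start_dim = last_dim = 1
# for a [T, D] activation on a 1-D mesh.
from torch.distributed.tensor.placement_types import Partial, Replicate, Shard

assert _replicate_dims_start_at((Shard(0), ), start_dim=1) == (Shard(0), )  # sequence sharding kept
assert _replicate_dims_start_at((Shard(1), ), start_dim=1) == (Replicate(), )  # hidden-dim sharding dropped
assert _replicate_dims_start_at((Partial(), ), start_dim=1) == (Replicate(), )  # partials forced to replicate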
torch-ext/activation/parallel_style.py
ADDED
@@ -0,0 +1,50 @@
from abc import ABC, abstractmethod
from functools import partial
from typing import Any, Optional, Union

import torch
import torch.nn as nn
from torch.distributed.tensor import (DeviceMesh, DTensor, Replicate, Shard,
                                      distribute_module, distribute_tensor)
from torch.distributed.tensor.parallel import SequenceParallel
from torch.distributed.tensor.placement_types import Placement


class ResidualSequenceParallel(SequenceParallel):
    """ Consider the case where we have a residual connection across a sequence parallel layer."""

    @staticmethod
    def _prepare_input_fn(sequence_sharding, mod, inputs, device_mesh):
        input_tensor = inputs[0]
        residual_tensor = inputs[1]

        assert isinstance(input_tensor,
                          DTensor) == isinstance(residual_tensor, DTensor)
        assert isinstance(input_tensor,
                          torch.Tensor) == isinstance(residual_tensor,
                                                      torch.Tensor)

        if isinstance(input_tensor, DTensor):
            # if the passed in input DTensor is not sharded on the sequence dim, we need to redistribute it
            if input_tensor.placements != sequence_sharding:
                input_tensor = input_tensor.redistribute(
                    placements=sequence_sharding, async_op=True)
            if residual_tensor.placements != sequence_sharding:
                residual_tensor = residual_tensor.redistribute(
                    placements=sequence_sharding, async_op=True)
            return input_tensor, residual_tensor

        elif isinstance(input_tensor, torch.Tensor):
            # assume the input passed in already sharded on the sequence dim and create the DTensor
            return DTensor.from_local(input_tensor,
                                      device_mesh,
                                      sequence_sharding,
                                      run_check=False), DTensor.from_local(
                                          residual_tensor,
                                          device_mesh,
                                          sequence_sharding,
                                          run_check=False)
        else:
            raise ValueError(
                f"expecting input of {mod} to be a torch.Tensor or DTensor, but got {input_tensor}"
            )
torch-ext/activation/rms_norm.py
CHANGED
@@ -91,10 +91,12 @@ def fused_add_rms_norm_abstract(x, residual, weight, eps):
 def fused_add_rms_norm_backward_abstract(output_grad, add_output_grad,
                                          add_output, weight, eps,
                                          need_input_grad: bool):
-    return torch.empty_like(
-        weight)
+    return torch.empty_like(
+        output_grad) if need_input_grad else None, torch.empty_like(weight)
 
 
 if version.parse(torch.__version__) >= version.parse("2.8"):
+    from .fused_add_rms_norm_meta import register_fused_add_rms_norm_meta
     from .rms_norm_meta import register_rms_norm_meta
+    register_fused_add_rms_norm_meta()
     register_rms_norm_meta()
torch-ext/activation/rms_norm_meta.py
CHANGED
@@ -48,9 +48,10 @@ def rms_norm_strategy(op_schema: OpSchema) -> OpStrategy:
 
     last_dim = input_strategy.ndim - 1
     strategy = OpStrategy([])
-    for
-
-
+    for input, weight in zip(input_strategy.strategies,
+                             weight_strategy.strategies):
+        input_src = input.output_spec
+        weight_src = weight.output_spec
 
         assert isinstance(input_src, DTensorSpec)
         assert isinstance(weight_src, DTensorSpec)
@@ -102,16 +103,27 @@ def rms_norm_backward_strategy(op_schema: OpSchema) -> OpStrategy:
     assert isinstance(input_strategy, OpStrategy)
     assert isinstance(weight_strategy, OpStrategy)
 
-
-
-
+    lengths = {
+        "output_grad": len(output_grad_strategy.strategies),
+        "input": len(input_strategy.strategies),
+        "weight": len(weight_strategy.strategies),
+    }
+
+    assert len(set(
+        lengths.values())) == 1, f"Strategies length mismatch {lengths}"
+
+    zipped = zip(
+        output_grad_strategy.strategies,
+        input_strategy.strategies,
+        weight_strategy.strategies,
+    )
 
     last_dim = input_strategy.ndim - 1
     strategy = OpStrategy([])
-    for
-    output_grad_src =
-    input_src =
-    weight_src =
+    for output_grad, input, weight in zipped:
+        output_grad_src = output_grad.output_spec
+        input_src = input.output_spec
+        weight_src = weight.output_spec
 
         assert isinstance(output_grad_src, DTensorSpec)
         assert isinstance(input_src, DTensorSpec)
@@ -119,7 +131,7 @@ def rms_norm_backward_strategy(op_schema: OpSchema) -> OpStrategy:
 
         redistribute_costs = []
 
-        # Output grad
+        # Output grad can be sharded in any dim except the last dim.
         output_grad_tgt = DTensorSpec(
             mesh=mesh,
             placements=_replicate_dims_start_at(output_grad_src.placements,
@@ -128,12 +140,9 @@ def rms_norm_backward_strategy(op_schema: OpSchema) -> OpStrategy:
         )
         redistribute_costs.append(
             generate_redistribute_costs(output_grad_strategy, output_grad_tgt))
-
-
-
-                                                last_dim),
-            tensor_meta=input_src.tensor_meta,
-        )
+
+        # Input must have the same sharding as output grad.
+        input_tgt = output_grad_tgt
         redistribute_costs.append(
             generate_redistribute_costs(input_strategy, input_tgt))
 