danieldk (HF Staff) committed
Commit bbbdefe · 1 Parent(s): 9b61b27

Sync with upstream, add tests

flake.lock CHANGED
@@ -73,11 +73,11 @@
         "nixpkgs": "nixpkgs"
       },
       "locked": {
-        "lastModified": 1750234878,
-        "narHash": "sha256-q9DRC9zdpzUf88qqg1qbhP1qgJbE2cMtn8oUmosuyT8=",
+        "lastModified": 1754038838,
+        "narHash": "sha256-oHigCT4z0ayyLyEuxdZooSXRAZP8lfOkZHzY1lx1U50=",
         "owner": "huggingface",
         "repo": "hf-nix",
-        "rev": "c7132f90763d756da3e77da62e01be0a4546dc57",
+        "rev": "336f781fa284e193baa3d4c3ce3f95fb34e9ffad",
         "type": "github"
       },
       "original": {
@@ -98,11 +98,11 @@
         ]
       },
       "locked": {
-        "lastModified": 1750409351,
-        "narHash": "sha256-xkzrwee77LrBDtwNNihBkYbY7yUwdOv0/4+J3B5xCZE=",
+        "lastModified": 1756320464,
+        "narHash": "sha256-x9LI4h87/Z9UgTQjgeG0fRcdeXl91xIqBlTauGKZM70=",
         "owner": "huggingface",
         "repo": "kernel-builder",
-        "rev": "9e61fba877153bffa6eaff023243fd81220c0eea",
+        "rev": "b4accba4496b28faef19a0487fbcf9686b14e2ef",
         "type": "github"
       },
       "original": {
@@ -113,17 +113,17 @@
     },
     "nixpkgs": {
       "locked": {
-        "lastModified": 1747820358,
-        "narHash": "sha256-fTqsZsUX6M3yeEvgyQvXcbGmT2CaRVyVwsi8eK29Oj4=",
-        "owner": "danieldk",
+        "lastModified": 1752785354,
+        "narHash": "sha256-Y33ryUz7MPqKrZwlbQcsYCUz2jAJCacRf8jbs0tYUlA=",
+        "owner": "nixos",
         "repo": "nixpkgs",
-        "rev": "d3c1681180717528068082103bf323147de6ab0b",
+        "rev": "d38025438a6ee456758dc03188ca6873a415463b",
         "type": "github"
       },
       "original": {
-        "owner": "danieldk",
-        "ref": "cudatoolkit-12.9-kernel-builder",
+        "owner": "nixos",
         "repo": "nixpkgs",
+        "rev": "d38025438a6ee456758dc03188ca6873a415463b",
         "type": "github"
       }
     },
flake.nix CHANGED
@@ -13,5 +13,8 @@
     kernel-builder.lib.genFlakeOutputs {
       path = ./.;
       rev = self.shortRev or self.dirtyShortRev or self.lastModifiedDate;
+      # Import-time autotune.
+      doGetKernelCheck = false;
+      pythonCheckInputs = pkgs: with pkgs; [ einops ];
     };
 }
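The two options added to genFlakeOutputs both relate to exercising the kernel: the Triton layer-norm kernels autotune when they are first used (hence the "Import-time autotune." note and the disabled get-kernel check), and the new tests depend on einops. As a rough, hedged sketch of what calling the built extension looks like (the shapes, dtypes, and device below are illustrative, not taken from the repo):

import torch
from triton_layer_norm import layer_norm_fn

# The first launch of the @triton.autotune'd kernel benchmarks the warp-count
# configs, so this needs a CUDA device and takes longer than subsequent calls.
x = torch.randn(8, 512, 2048, device="cuda", dtype=torch.float16)
weight = torch.randn(2048, device="cuda", dtype=torch.float16)
bias = torch.randn(2048, device="cuda", dtype=torch.float16)

out = layer_norm_fn(x, weight, bias, eps=1e-6)
print(out.shape, out.dtype)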
tests/test_layer_norm.py ADDED
@@ -0,0 +1,373 @@
1
+ # Copyright (c) 2024, Tri Dao.
2
+
3
+ import pytest
4
+ import torch
5
+ import torch.nn.functional as F
6
+ from einops import rearrange, repeat
7
+
8
+ from triton_layer_norm import (
9
+ layer_norm_fn,
10
+ layer_norm_linear_fn,
11
+ )
12
+ from triton_layer_norm.layer_norm import layer_norm_ref, rms_norm_ref
13
+
14
+
15
+ is_sm8x = torch.cuda.get_device_capability("cuda")[0] >= 8
16
+
17
+
18
+ # @pytest.mark.parametrize("zero_centered_weight", [False, True])
19
+ @pytest.mark.parametrize("zero_centered_weight", [False])
20
+ @pytest.mark.parametrize("has_weight1", [False, True])
21
+ # @pytest.mark.parametrize("has_weight1", [False])
22
+ @pytest.mark.parametrize("has_x1", [False, True])
23
+ # @pytest.mark.parametrize("has_x1", [False])
24
+ @pytest.mark.parametrize("has_rowscale", [False, True])
25
+ # @pytest.mark.parametrize("has_rowscale", [False])
26
+ @pytest.mark.parametrize("dropout_p", [0.0, 0.27])
27
+ # @pytest.mark.parametrize("dropout_p", [0.0])
28
+ @pytest.mark.parametrize("prenorm", [True, False])
29
+ # @pytest.mark.parametrize("prenorm", [True])
30
+ @pytest.mark.parametrize("is_rms_norm", [False, True])
31
+ # @pytest.mark.parametrize("is_rms_norm", [True])
32
+ @pytest.mark.parametrize("has_residual", [True, False])
33
+ # @pytest.mark.parametrize("has_residual", [True])
34
+ @pytest.mark.parametrize(
35
+ "weight_dtype", [torch.float32, torch.float16] + ([torch.bfloat16] if is_sm8x else [])
36
+ )
37
+ # @pytest.mark.parametrize("weight_dtype", [torch.float32])
38
+ @pytest.mark.parametrize(
39
+ "input_dtype,residual_dtype",
40
+ [(torch.float16, torch.float16), (torch.float16, torch.float32), (torch.float32, torch.float32)]
41
+ + ([(torch.bfloat16, torch.bfloat16), (torch.bfloat16, torch.float32)] if is_sm8x else []),
42
+ )
43
+ # @pytest.mark.parametrize("input_dtype,residual_dtype", [(torch.float16, torch.float16)])
44
+ @pytest.mark.parametrize("hidden_size", [192, 2048, 2560, 3000, 4096])
45
+ # @pytest.mark.parametrize("hidden_size", [1024])
46
+ def test_layer_norm(
47
+ hidden_size,
48
+ input_dtype,
49
+ residual_dtype,
50
+ weight_dtype,
51
+ has_residual,
52
+ is_rms_norm,
53
+ prenorm,
54
+ dropout_p,
55
+ has_rowscale,
56
+ has_x1,
57
+ has_weight1,
58
+ zero_centered_weight,
59
+ ):
60
+ if has_rowscale and has_x1:
61
+ pytest.skip("Not supported")
62
+ device = "cuda"
63
+ if any(x == torch.bfloat16 for x in [input_dtype, residual_dtype, weight_dtype]):
64
+ atol = 5e-2
65
+ elif any(x == torch.float16 for x in [input_dtype, residual_dtype, weight_dtype]):
66
+ atol = 1e-2
67
+ else:
68
+ atol = 1e-4
69
+ # set seed
70
+ torch.random.manual_seed(0)
71
+ batch_size = 8
72
+ seqlen = 512
73
+ layer_norm_ref_fn = layer_norm_ref if not is_rms_norm else rms_norm_ref
74
+ allclose = (
75
+ # Sometimes x0_pt.grad is NaN
76
+ lambda x, x_pt, x_ref, atol=atol: (x - x_ref).abs().max()
77
+ <= 2 * (x_pt[~x_pt.isnan()] - x_ref[~x_pt.isnan()]).abs().max() + atol
78
+ or (
79
+ # Sometimes x_pt and x_ref are the same (e.g. bfloat16) so we want to perturb it a bit
80
+ # by multiplying and dividing by 0.3
81
+ (x_pt[~x_pt.isnan()] - x_ref[~x_pt.isnan()]).abs().max() == 0.0
82
+ and (x - x_ref).abs().max()
83
+ <= 2 * (x_pt[~x_pt.isnan()] * 0.3 / 0.3 - x_ref[~x_pt.isnan()]).abs().max() + atol
84
+ )
85
+ )
86
+ x0 = torch.randn(
87
+ batch_size, seqlen, hidden_size, device=device, dtype=input_dtype, requires_grad=True
88
+ )
89
+ x0_pt = x0.detach().clone().requires_grad_()
90
+ x0_ref = x0.detach().clone().requires_grad_()
91
+ if has_residual:
92
+ res = torch.randn_like(x0, dtype=residual_dtype, requires_grad=True)
93
+ res_pt = res.detach().clone().requires_grad_()
94
+ res_ref = res.detach().clone().requires_grad_()
95
+ else:
96
+ res, res_pt, res_ref = None, None, None
97
+ weight = torch.randn(hidden_size, device=device, dtype=weight_dtype, requires_grad=True)
98
+ if not is_rms_norm:
99
+ bias = torch.randn(hidden_size, device=device, dtype=weight_dtype, requires_grad=True)
100
+ else:
101
+ bias = None
102
+ weight_pt = weight.detach().clone().requires_grad_()
103
+ weight_ref = weight.detach().clone().requires_grad_()
104
+ bias_pt = bias.detach().clone().requires_grad_() if bias is not None else None
105
+ bias_ref = bias.detach().clone().requires_grad_() if bias is not None else None
106
+ if has_x1:
107
+ x1 = torch.randn_like(x0, dtype=input_dtype, requires_grad=True)
108
+ x1_pt = x1.detach().clone().requires_grad_()
109
+ x1_ref = x1.detach().clone().requires_grad_()
110
+ else:
111
+ x1, x1_pt, x1_ref = None, None, None
112
+ if has_weight1:
113
+ weight1 = torch.randn(
114
+ hidden_size, device=device, dtype=weight_dtype, requires_grad=True
115
+ )
116
+ weight1_pt = weight1.detach().clone().requires_grad_()
117
+ weight1_ref = weight1.detach().clone().requires_grad_()
118
+ if not is_rms_norm:
119
+ bias1 = torch.randn(
120
+ hidden_size, device=device, dtype=weight_dtype, requires_grad=True
121
+ )
122
+ else:
123
+ bias1 = None
124
+ bias1_pt = bias1.detach().clone().requires_grad_() if bias1 is not None else None
125
+ bias1_ref = bias1.detach().clone().requires_grad_() if bias1 is not None else None
126
+ else:
127
+ weight1, weight1_pt, weight1_ref = None, None, None
128
+ bias1, bias1_pt, bias1_ref = None, None, None
129
+
130
+ rowscale = (
131
+ torch.randn(batch_size, seqlen, dtype=input_dtype, device=device)
132
+ if has_rowscale
133
+ else None
134
+ )
135
+
136
+ residual_in_fp32 = (not has_residual) and residual_dtype == torch.float32
137
+ out, *rest = layer_norm_fn(
138
+ x0,
139
+ weight,
140
+ bias,
141
+ residual=res,
142
+ x1=x1,
143
+ weight1=weight1,
144
+ bias1=bias1,
145
+ eps=1e-6,
146
+ dropout_p=dropout_p,
147
+ rowscale=rowscale,
148
+ prenorm=prenorm,
149
+ residual_in_fp32=residual_in_fp32,
150
+ zero_centered_weight=zero_centered_weight,
151
+ is_rms_norm=is_rms_norm,
152
+ return_dropout_mask=True,
153
+ )
154
+ dropout_mask = rest[-2] if dropout_p > 0.0 else None
155
+ dropout_mask1 = rest[-1] if dropout_p > 0.0 and x1 is not None else None
156
+ out_pt = layer_norm_ref_fn(
157
+ x0_pt,
158
+ weight_pt,
159
+ bias_pt,
160
+ residual=res_pt,
161
+ x1=x1_pt,
162
+ weight1=weight1_pt,
163
+ bias1=bias1_pt,
164
+ eps=1e-6,
165
+ dropout_p=dropout_p,
166
+ rowscale=rowscale,
167
+ prenorm=prenorm,
168
+ zero_centered_weight=zero_centered_weight,
169
+ dropout_mask=dropout_mask,
170
+ dropout_mask1=dropout_mask1,
171
+ )
172
+ out_ref = layer_norm_ref_fn(
173
+ x0_ref,
174
+ weight_ref,
175
+ bias_ref,
176
+ residual=res_ref,
177
+ x1=x1_ref,
178
+ weight1=weight1_ref,
179
+ bias1=bias1_ref,
180
+ eps=1e-6,
181
+ dropout_p=dropout_p,
182
+ rowscale=rowscale,
183
+ prenorm=prenorm,
184
+ zero_centered_weight=zero_centered_weight,
185
+ dropout_mask=dropout_mask,
186
+ dropout_mask1=dropout_mask1,
187
+ upcast=True,
188
+ )
189
+ if not has_weight1:
190
+ if prenorm:
191
+ residual = rest[0]
192
+ out_pt, residual_pt = out_pt
193
+ out_ref, residual_ref = out_ref
194
+ out1, out1_pt, out1_ref = None, None, None
195
+ else:
196
+ out1 = rest.pop(0)
197
+ if prenorm:
198
+ residual = rest[0]
199
+ out_pt, out1_pt, residual_pt = out_pt
200
+ out_ref, out1_ref, residual_ref = out_ref
201
+ else:
202
+ out_pt, out1_pt = out_pt
203
+ out_ref, out1_ref = out_ref
204
+ assert out.dtype == input_dtype
205
+ if prenorm:
206
+ assert residual.dtype == residual_dtype
207
+ assert allclose(residual, residual_pt, residual_ref)
208
+ assert allclose(out, out_pt, out_ref)
209
+ if out1 is not None:
210
+ assert out1.dtype == input_dtype
211
+ assert allclose(out1, out1_pt, out1_ref)
212
+ if dropout_mask is not None:
213
+ dropout_fraction = 1.0 - dropout_mask.float().mean()
214
+ assert abs(dropout_fraction - dropout_p) < 0.01
215
+ if dropout_mask1 is not None:
216
+ dropout_fraction = 1.0 - dropout_mask1.float().mean()
217
+ assert abs(dropout_fraction - dropout_p) < 0.01
218
+ assert not torch.equal(dropout_mask, dropout_mask1)
219
+
220
+ g = torch.randn_like(out) / batch_size
221
+ if has_weight1:
222
+ out = out * F.gelu(out1)
223
+ out_pt = out_pt * F.gelu(out1_pt)
224
+ out_ref = out_ref * F.gelu(out1_ref)
225
+ if not prenorm:
226
+ out.backward(g)
227
+ out_pt.backward(g)
228
+ out_ref.backward(g)
229
+ else:
230
+ (out * F.sigmoid(residual)).backward(g)
231
+ (out_pt * F.sigmoid(residual_pt)).backward(g)
232
+ (out_ref * F.sigmoid(residual_ref.to(dtype=residual_dtype))).backward(g)
233
+ assert allclose(x0.grad, x0_pt.grad, x0_ref.grad)
234
+ if has_residual:
235
+ assert allclose(res.grad, res_pt.grad, res_ref.grad)
236
+ if has_x1:
237
+ assert allclose(x1.grad, x1_pt.grad, x1_ref.grad)
238
+ assert allclose(weight.grad, weight_pt.grad, weight_ref.grad)
239
+ if bias is not None:
240
+ assert allclose(bias.grad, bias_pt.grad, bias_ref.grad)
241
+ if has_weight1:
242
+ assert allclose(weight1.grad, weight1_pt.grad, weight1_ref.grad)
243
+ if bias1 is not None:
244
+ assert allclose(bias1.grad, bias1_pt.grad, bias1_ref.grad)
245
+
246
+
247
+ @pytest.mark.parametrize("prenorm", [True, False])
248
+ # @pytest.mark.parametrize("prenorm", [True])
249
+ @pytest.mark.parametrize("is_rms_norm", [False, True])
250
+ # @pytest.mark.parametrize("is_rms_norm", [True])
251
+ @pytest.mark.parametrize("has_residual", [True, False])
252
+ # @pytest.mark.parametrize("has_residual", [False])
253
+ @pytest.mark.parametrize("weight_dtype", [torch.float32])
254
+ @pytest.mark.parametrize(
255
+ "input_dtype,residual_dtype",
256
+ [(torch.float16, torch.float16), (torch.float16, torch.float32)]
257
+ + ([(torch.bfloat16, torch.bfloat16), (torch.bfloat16, torch.float32)] if is_sm8x else []),
258
+ )
259
+ # @pytest.mark.parametrize("input_dtype,residual_dtype", [(torch.bfloat16, torch.float32)])
260
+ @pytest.mark.parametrize("hidden_size", [192, 2048, 2560, 3000])
261
+ # @pytest.mark.parametrize("hidden_size", [256])
262
+ def test_layer_norm_linear(
263
+ hidden_size, input_dtype, residual_dtype, weight_dtype, has_residual, is_rms_norm, prenorm
264
+ ):
265
+ device = "cuda"
266
+ if any(x == torch.bfloat16 for x in [input_dtype, residual_dtype, weight_dtype]):
267
+ atol = 5e-2
268
+ elif any(x == torch.float16 for x in [input_dtype, residual_dtype, weight_dtype]):
269
+ atol = 1e-2
270
+ else:
271
+ atol = 1e-4
272
+ # set seed
273
+ torch.random.manual_seed(0)
274
+ batch_size = 4
275
+ seqlen = 512
276
+ # batch_size = 1
277
+ # seqlen = 1
278
+ layer_norm_ref_fn = layer_norm_ref if not is_rms_norm else rms_norm_ref
279
+ allclose = (
280
+ lambda x, x_pt, x_ref, atol=atol: (x - x_ref).abs().max()
281
+ <= 2 * (x_pt - x_ref).abs().max() + atol
282
+ )
283
+ x0 = torch.randn(
284
+ batch_size, seqlen, hidden_size, device=device, dtype=input_dtype, requires_grad=True
285
+ )
286
+ x0_pt = x0.detach().clone().requires_grad_()
287
+ x0_ref = x0.detach().clone().requires_grad_()
288
+ if has_residual:
289
+ res = torch.randn_like(x0, dtype=residual_dtype, requires_grad=True)
290
+ res_pt = res.detach().clone().requires_grad_()
291
+ res_ref = res.detach().clone().requires_grad_()
292
+ else:
293
+ res, res_pt, res_ref = None, None, None
294
+ norm_weight = torch.randn(hidden_size, device=device, dtype=weight_dtype, requires_grad=True)
295
+ if not is_rms_norm:
296
+ norm_bias = torch.randn(hidden_size, device=device, dtype=weight_dtype, requires_grad=True)
297
+ else:
298
+ norm_bias = None
299
+ norm_weight_pt = norm_weight.detach().clone().requires_grad_()
300
+ norm_weight_ref = norm_weight.detach().clone().requires_grad_()
301
+ norm_bias_pt = norm_bias.detach().clone().requires_grad_() if norm_bias is not None else None
302
+ norm_bias_ref = norm_bias.detach().clone().requires_grad_() if norm_bias is not None else None
303
+ linear_weight = torch.empty(
304
+ 2 * hidden_size, hidden_size, device=device, dtype=weight_dtype, requires_grad=True
305
+ )
306
+ torch.nn.init.xavier_uniform_(linear_weight)
307
+ if not is_rms_norm:
308
+ linear_bias = torch.randn(
309
+ 2 * hidden_size, device=device, dtype=weight_dtype, requires_grad=True
310
+ )
311
+ else:
312
+ linear_bias = None
313
+ linear_weight_pt = linear_weight.detach().clone().requires_grad_()
314
+ linear_weight_ref = linear_weight.detach().clone().requires_grad_()
315
+ linear_bias_pt = (
316
+ linear_bias.detach().clone().requires_grad_() if linear_bias is not None else None
317
+ )
318
+ linear_bias_ref = (
319
+ linear_bias.detach().clone().requires_grad_() if linear_bias is not None else None
320
+ )
321
+
322
+ residual_in_fp32 = (not has_residual) and residual_dtype == torch.float32
323
+ with torch.autocast(device_type="cuda", dtype=input_dtype):
324
+ out, *rest = layer_norm_linear_fn(
325
+ x0,
326
+ norm_weight,
327
+ norm_bias,
328
+ linear_weight,
329
+ linear_bias,
330
+ residual=res,
331
+ eps=1e-6,
332
+ prenorm=prenorm,
333
+ residual_in_fp32=residual_in_fp32,
334
+ is_rms_norm=is_rms_norm,
335
+ )
336
+ out_pt, *rest_pt = layer_norm_ref_fn(
337
+ x0_pt, norm_weight_pt, norm_bias_pt, residual=res_pt, eps=1e-6, prenorm=prenorm
338
+ )
339
+ with torch.autocast(device_type="cuda", dtype=input_dtype):
340
+ out_pt = F.linear(out_pt, linear_weight_pt, linear_bias_pt)
341
+ out_ref, *rest_ref = layer_norm_ref_fn(
342
+ x0_ref,
343
+ norm_weight_ref,
344
+ norm_bias_ref,
345
+ residual=res_ref,
346
+ eps=1e-6,
347
+ prenorm=prenorm,
348
+ upcast=True,
349
+ )
350
+ out_ref = F.linear(out_ref.to(linear_weight_ref.dtype), linear_weight_ref, linear_bias_ref)
351
+ if prenorm:
352
+ residual = rest[0]
353
+ residual_pt = rest_pt[0]
354
+ residual_ref = rest_ref[0]
355
+ assert out.dtype == input_dtype
356
+ if prenorm:
357
+ assert residual.dtype == residual_dtype
358
+ assert allclose(residual, residual_pt, residual_ref)
359
+ assert allclose(out, out_pt, out_ref)
360
+
361
+ g = torch.randn_like(out) / batch_size
362
+ out.backward(g)
363
+ out_pt.backward(g)
364
+ out_ref.backward(g)
365
+ assert allclose(x0.grad, x0_pt.grad, x0_ref.grad)
366
+ if has_residual:
367
+ assert allclose(res.grad, res_pt.grad, res_ref.grad)
368
+ assert allclose(norm_weight.grad, norm_weight_pt.grad, norm_weight_ref.grad)
369
+ if norm_bias is not None:
370
+ assert allclose(norm_bias.grad, norm_bias_pt.grad, norm_bias_ref.grad)
371
+ assert allclose(linear_weight.grad, linear_weight_pt.grad, linear_weight_ref.grad)
372
+ if linear_bias is not None:
373
+ assert allclose(linear_bias.grad, linear_bias_pt.grad, linear_bias_ref.grad)
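The test module above needs a CUDA GPU (the bfloat16 cases are additionally gated on is_sm8x) plus einops, which is exactly what the pythonCheckInputs addition in flake.nix provides. A hedged sketch of a local invocation; the path and the -k filter are illustrative:

import pytest

# Assumes pytest, einops, a CUDA device, and an importable triton_layer_norm build.
pytest.main(["tests/test_layer_norm.py", "-k", "test_layer_norm_linear", "-x"])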
torch-ext/triton_layer_norm/__init__.py CHANGED
@@ -25,6 +25,7 @@ def layer_norm(
     rowscale=None,
     prenorm: bool = False,
     residual_in_fp32: bool = False,
+    zero_centered_weight: bool = False,
     is_rms_norm: bool = False,
     return_dropout_mask: bool = False,
     out: Optional[torch.Tensor] = None,
@@ -61,6 +62,8 @@ def layer_norm(
             If True, returns both the normalized output and the unnormalized input+residual.
         residual_in_fp32 (`bool`, *optional*, defaults to False):
             If True, performs the residual connection in FP32 precision.
+        zero_centered_weight (`bool`, *optional*, defaults to False):
+            When set to true, 1.0 is added to the weight before applying it.
         is_rms_norm (`bool`, *optional*, defaults to False):
             If True, uses RMS normalization instead of layer normalization.
         return_dropout_mask (`bool`, *optional*, defaults to False):
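To make the new flag concrete: with zero_centered_weight the scale parameter is stored centered around 0 and 1.0 is added before it is applied, so a zero-initialized weight behaves like an all-ones weight. A minimal reference sketch in plain PyTorch (not the Triton path; the helper name and shapes are illustrative):

import torch
import torch.nn.functional as F

def layer_norm_zero_centered_ref(x, weight, bias, eps=1e-6):
    # zero_centered_weight=True is equivalent to shifting the weight by 1.0
    # before the usual affine transform.
    return F.layer_norm(
        x.float(), x.shape[-1:], weight=weight.float() + 1.0, bias=bias.float(), eps=eps
    ).to(x.dtype)

x = torch.randn(4, 192)
w = torch.zeros(192)  # zero-centered: acts like an all-ones weight
b = torch.zeros(192)
torch.testing.assert_close(
    layer_norm_zero_centered_ref(x, w, b),
    F.layer_norm(x, x.shape[-1:], eps=1e-6),
)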
torch-ext/triton_layer_norm/layer_norm.py CHANGED
@@ -7,14 +7,40 @@
7
  # The models we train have hidden dim up to 8k anyway (e.g. Llama 70B), so this is fine.
8
 
9
  import math
 
10
 
11
  import torch
12
  import torch.nn.functional as F
13
- from torch.amp import custom_fwd, custom_bwd
14
 
15
  import triton
16
  import triton.language as tl
17
 
 
 
 
 
18
 
19
  def layer_norm_ref(
20
  x,
@@ -28,6 +54,7 @@ def layer_norm_ref(
28
  dropout_p=0.0,
29
  rowscale=None,
30
  prenorm=False,
 
31
  dropout_mask=None,
32
  dropout_mask1=None,
33
  upcast=False,
@@ -41,6 +68,10 @@ def layer_norm_ref(
41
  x1 = x1.float() if x1 is not None else None
42
  weight1 = weight1.float() if weight1 is not None else None
43
  bias1 = bias1.float() if bias1 is not None else None
 
 
 
 
44
  if x1 is not None:
45
  assert rowscale is None, "rowscale is not supported with parallel LayerNorm"
46
  if rowscale is not None:
@@ -59,9 +90,9 @@ def layer_norm_ref(
59
  x = x + x1
60
  if residual is not None:
61
  x = (x + residual).to(x.dtype)
62
- out = F.layer_norm(
63
- x.to(weight.dtype), x.shape[-1:], weight=weight, bias=bias, eps=eps
64
- ).to(dtype)
65
  if weight1 is None:
66
  return out if not prenorm else (out, x)
67
  else:
@@ -83,6 +114,7 @@ def rms_norm_ref(
83
  dropout_p=0.0,
84
  rowscale=None,
85
  prenorm=False,
 
86
  dropout_mask=None,
87
  dropout_mask1=None,
88
  upcast=False,
@@ -96,6 +128,10 @@ def rms_norm_ref(
96
  x1 = x1.float() if x1 is not None else None
97
  weight1 = weight1.float() if weight1 is not None else None
98
  bias1 = bias1.float() if bias1 is not None else None
 
 
 
 
99
  if x1 is not None:
100
  assert rowscale is None, "rowscale is not supported with parallel LayerNorm"
101
  if rowscale is not None:
@@ -115,34 +151,26 @@ def rms_norm_ref(
115
  if residual is not None:
116
  x = (x + residual).to(x.dtype)
117
  rstd = 1 / torch.sqrt((x.square()).mean(dim=-1, keepdim=True) + eps)
118
- out = ((x * rstd * weight) + bias if bias is not None else (x * rstd * weight)).to(
119
- dtype
120
- )
121
  if weight1 is None:
122
  return out if not prenorm else (out, x)
123
  else:
124
- out1 = (
125
- (x * rstd * weight1) + bias1 if bias1 is not None else (x * rstd * weight1)
126
- ).to(dtype)
127
  return (out, out1) if not prenorm else (out, out1, x)
128
 
129
 
130
  @triton.autotune(
131
- configs=[
132
- triton.Config({}, num_warps=1),
133
- triton.Config({}, num_warps=2),
134
- triton.Config({}, num_warps=4),
135
- triton.Config({}, num_warps=8),
136
- triton.Config({}, num_warps=16),
137
- triton.Config({}, num_warps=32),
138
- ],
139
- key=["N", "HAS_RESIDUAL", "STORE_RESIDUAL_OUT", "IS_RMS_NORM", "HAS_BIAS"],
140
  )
 
141
  # @triton.heuristics({"HAS_BIAS": lambda args: args["B"] is not None})
142
  # @triton.heuristics({"HAS_RESIDUAL": lambda args: args["RESIDUAL"] is not None})
143
- @triton.heuristics({"HAS_X1": lambda args: args["X1"] is not None})
144
- @triton.heuristics({"HAS_W1": lambda args: args["W1"] is not None})
145
- @triton.heuristics({"HAS_B1": lambda args: args["B1"] is not None})
146
  @triton.jit
147
  def _layer_norm_fwd_1pass_kernel(
148
  X, # pointer to the input
@@ -158,6 +186,7 @@ def _layer_norm_fwd_1pass_kernel(
158
  ROWSCALE,
159
  SEEDS, # Dropout seeds for each row
160
  DROPOUT_MASK,
 
161
  Mean, # pointer to the mean
162
  Rstd, # pointer to the 1/std
163
  stride_x_row, # how much to increase the pointer when moving by 1 row
@@ -170,6 +199,7 @@ def _layer_norm_fwd_1pass_kernel(
170
  N, # number of columns in X
171
  eps, # epsilon to avoid division by zero
172
  dropout_p, # Dropout probability
 
173
  IS_RMS_NORM: tl.constexpr,
174
  BLOCK_N: tl.constexpr,
175
  HAS_RESIDUAL: tl.constexpr,
@@ -203,9 +233,7 @@ def _layer_norm_fwd_1pass_kernel(
203
  if HAS_DROPOUT:
204
  # Compute dropout mask
205
  # 7 rounds is good enough, and reduces register pressure
206
- keep_mask = (
207
- tl.rand(tl.load(SEEDS + row).to(tl.uint32), cols, n_rounds=7) > dropout_p
208
- )
209
  x = tl.where(keep_mask, x / (1.0 - dropout_p), 0.0)
210
  if STORE_DROPOUT_MASK:
211
  tl.store(DROPOUT_MASK + row * N + cols, keep_mask, mask=cols < N)
@@ -218,12 +246,11 @@ def _layer_norm_fwd_1pass_kernel(
218
  # Compute dropout mask
219
  # 7 rounds is good enough, and reduces register pressure
220
  keep_mask = (
221
- tl.rand(tl.load(SEEDS + M + row).to(tl.uint32), cols, n_rounds=7)
222
- > dropout_p
223
  )
224
  x1 = tl.where(keep_mask, x1 / (1.0 - dropout_p), 0.0)
225
  if STORE_DROPOUT_MASK:
226
- tl.store(DROPOUT_MASK + (M + row) * N + cols, keep_mask, mask=cols < N)
227
  x += x1
228
  if HAS_RESIDUAL:
229
  residual = tl.load(RESIDUAL + cols, mask=cols < N, other=0.0).to(tl.float32)
@@ -243,6 +270,8 @@ def _layer_norm_fwd_1pass_kernel(
243
  # Normalize and apply linear transformation
244
  mask = cols < N
245
  w = tl.load(W + cols, mask=mask).to(tl.float32)
 
 
246
  if HAS_BIAS:
247
  b = tl.load(B + cols, mask=mask).to(tl.float32)
248
  x_hat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd
@@ -251,6 +280,8 @@ def _layer_norm_fwd_1pass_kernel(
251
  tl.store(Y + cols, y, mask=mask)
252
  if HAS_W1:
253
  w1 = tl.load(W1 + cols, mask=mask).to(tl.float32)
 
 
254
  if HAS_B1:
255
  b1 = tl.load(B1 + cols, mask=mask).to(tl.float32)
256
  y1 = x_hat * w1 + b1 if HAS_B1 else x_hat * w1
@@ -258,25 +289,87 @@ def _layer_norm_fwd_1pass_kernel(
258
 
259
 
260
  def _layer_norm_fwd(
261
- x,
262
- weight,
263
- bias,
264
- eps,
265
- residual=None,
266
- x1=None,
267
- weight1=None,
268
- bias1=None,
269
- dropout_p=0.0,
270
- rowscale=None,
271
- out_dtype=None,
272
- residual_dtype=None,
273
- is_rms_norm=False,
274
- return_dropout_mask=False,
275
- out=None,
276
- residual_out=None,
277
- ):
 
 
 
 
 
 
278
  if residual is not None:
279
  residual_dtype = residual.dtype
 
 
280
  M, N = x.shape
281
  assert x.stride(-1) == 1
282
  if residual is not None:
@@ -300,41 +393,17 @@ def _layer_norm_fwd(
300
  if rowscale is not None:
301
  assert rowscale.is_contiguous()
302
  assert rowscale.shape == (M,)
303
- # allocate output
304
- if out is None:
305
- out = torch.empty_like(x, dtype=x.dtype if out_dtype is None else out_dtype)
306
- else:
307
- assert out.shape == x.shape
308
  assert out.stride(-1) == 1
 
 
 
309
  if weight1 is not None:
310
  y1 = torch.empty_like(out)
311
  assert y1.stride(-1) == 1
312
  else:
313
  y1 = None
314
- if (
315
- residual is not None
316
- or (residual_dtype is not None and residual_dtype != x.dtype)
317
- or dropout_p > 0.0
318
- or rowscale is not None
319
- or x1 is not None
320
- ):
321
- if residual_out is None:
322
- residual_out = torch.empty(
323
- M,
324
- N,
325
- device=x.device,
326
- dtype=residual_dtype if residual_dtype is not None else x.dtype,
327
- )
328
- else:
329
- assert residual_out.shape == x.shape
330
- assert residual_out.stride(-1) == 1
331
- else:
332
- residual_out = None
333
- mean = (
334
- torch.empty((M,), dtype=torch.float32, device=x.device)
335
- if not is_rms_norm
336
- else None
337
- )
338
  rstd = torch.empty((M,), dtype=torch.float32, device=x.device)
339
  if dropout_p > 0.0:
340
  seeds = torch.randint(
@@ -343,18 +412,20 @@ def _layer_norm_fwd(
343
  else:
344
  seeds = None
345
  if return_dropout_mask and dropout_p > 0.0:
346
- dropout_mask = torch.empty(
347
- M if x1 is None else 2 * M, N, device=x.device, dtype=torch.bool
348
- )
 
 
349
  else:
350
- dropout_mask = None
351
  # Less than 64KB per feature: enqueue fused kernel
352
  MAX_FUSED_SIZE = 65536 // x.element_size()
353
  BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(N))
354
  if N > BLOCK_N:
355
  raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
356
  with torch.cuda.device(x.device.index):
357
- _layer_norm_fwd_1pass_kernel[(M,)](
358
  x,
359
  out,
360
  weight,
@@ -368,6 +439,7 @@ def _layer_norm_fwd(
368
  rowscale,
369
  seeds,
370
  dropout_mask,
 
371
  mean,
372
  rstd,
373
  x.stride(0),
@@ -380,6 +452,8 @@ def _layer_norm_fwd(
380
  N,
381
  eps,
382
  dropout_p,
 
 
383
  is_rms_norm,
384
  BLOCK_N,
385
  residual is not None,
@@ -388,50 +462,26 @@ def _layer_norm_fwd(
388
  dropout_p > 0.0,
389
  dropout_mask is not None,
390
  rowscale is not None,
 
 
 
391
  )
392
- # residual_out is None if residual is None and residual_dtype == input_dtype and dropout_p == 0.0
393
- if dropout_mask is not None and x1 is not None:
394
- dropout_mask, dropout_mask1 = dropout_mask.tensor_split(2, dim=0)
395
- else:
396
- dropout_mask1 = None
397
- return (
398
- out,
399
- y1,
400
- mean,
401
- rstd,
402
- residual_out if residual_out is not None else x,
403
- seeds,
404
- dropout_mask,
405
- dropout_mask1,
406
- )
407
 
408
 
409
  @triton.autotune(
410
- configs=[
411
- triton.Config({}, num_warps=1),
412
- triton.Config({}, num_warps=2),
413
- triton.Config({}, num_warps=4),
414
- triton.Config({}, num_warps=8),
415
- triton.Config({}, num_warps=16),
416
- triton.Config({}, num_warps=32),
417
- ],
418
- key=[
419
- "N",
420
- "HAS_DRESIDUAL",
421
- "STORE_DRESIDUAL",
422
- "IS_RMS_NORM",
423
- "HAS_BIAS",
424
- "HAS_DROPOUT",
425
- ],
426
  )
 
427
  # @triton.heuristics({"HAS_BIAS": lambda args: args["B"] is not None})
428
  # @triton.heuristics({"HAS_DRESIDUAL": lambda args: args["DRESIDUAL"] is not None})
429
  # @triton.heuristics({"STORE_DRESIDUAL": lambda args: args["DRESIDUAL_IN"] is not None})
430
- @triton.heuristics({"HAS_ROWSCALE": lambda args: args["ROWSCALE"] is not None})
431
- @triton.heuristics({"HAS_DY1": lambda args: args["DY1"] is not None})
432
- @triton.heuristics({"HAS_DX1": lambda args: args["DX1"] is not None})
433
- @triton.heuristics({"HAS_B1": lambda args: args["DB1"] is not None})
434
- @triton.heuristics({"RECOMPUTE_OUTPUT": lambda args: args["Y"] is not None})
435
  @triton.jit
436
  def _layer_norm_bwd_kernel(
437
  X, # pointer to the input
@@ -465,6 +515,7 @@ def _layer_norm_bwd_kernel(
465
  N, # number of columns in X
466
  eps, # epsilon to avoid division by zero
467
  dropout_p,
 
468
  rows_per_program,
469
  IS_RMS_NORM: tl.constexpr,
470
  BLOCK_N: tl.constexpr,
@@ -498,10 +549,14 @@ def _layer_norm_bwd_kernel(
498
  if RECOMPUTE_OUTPUT:
499
  Y += row_start * stride_y_row
500
  w = tl.load(W + cols, mask=mask).to(tl.float32)
 
 
501
  if RECOMPUTE_OUTPUT and HAS_BIAS:
502
  b = tl.load(B + cols, mask=mask, other=0.0).to(tl.float32)
503
  if HAS_DY1:
504
  w1 = tl.load(W1 + cols, mask=mask).to(tl.float32)
 
 
505
  dw = tl.zeros((BLOCK_N,), dtype=tl.float32)
506
  if HAS_BIAS:
507
  db = tl.zeros((BLOCK_N,), dtype=tl.float32)
@@ -550,18 +605,14 @@ def _layer_norm_bwd_kernel(
550
  if HAS_DX1:
551
  if HAS_DROPOUT:
552
  keep_mask = (
553
- tl.rand(tl.load(SEEDS + M + row).to(tl.uint32), cols, n_rounds=7)
554
- > dropout_p
555
  )
556
  dx1 = tl.where(keep_mask, dx / (1.0 - dropout_p), 0.0)
557
  else:
558
  dx1 = dx
559
  tl.store(DX1 + cols, dx1, mask=mask)
560
  if HAS_DROPOUT:
561
- keep_mask = (
562
- tl.rand(tl.load(SEEDS + row).to(tl.uint32), cols, n_rounds=7)
563
- > dropout_p
564
- )
565
  dx = tl.where(keep_mask, dx / (1.0 - dropout_p), 0.0)
566
  if HAS_ROWSCALE:
567
  rowscale = tl.load(ROWSCALE + row).to(tl.float32)
@@ -591,31 +642,93 @@ def _layer_norm_bwd_kernel(
591
 
592
 
593
  def _layer_norm_bwd(
594
- dy,
595
- x,
596
- weight,
597
- bias,
598
- eps,
599
- mean,
600
- rstd,
601
- dresidual=None,
602
- dy1=None,
603
- weight1=None,
604
- bias1=None,
605
- seeds=None,
606
- dropout_p=0.0,
607
- rowscale=None,
608
- has_residual=False,
609
- has_x1=False,
610
- is_rms_norm=False,
611
- x_dtype=None,
612
- recompute_output=False,
613
- ):
 
 
 
 
 
 
 
 
 
614
  M, N = x.shape
615
  assert x.stride(-1) == 1
 
616
  assert dy.stride(-1) == 1
617
  assert dy.shape == (M, N)
618
  if dresidual is not None:
 
619
  assert dresidual.stride(-1) == 1
620
  assert dresidual.shape == (M, N)
621
  assert weight.shape == (N,)
@@ -624,6 +737,7 @@ def _layer_norm_bwd(
624
  assert bias.stride(-1) == 1
625
  assert bias.shape == (N,)
626
  if dy1 is not None:
 
627
  assert weight1 is not None
628
  assert dy1.shape == dy.shape
629
  assert dy1.stride(-1) == 1
@@ -652,22 +766,18 @@ def _layer_norm_bwd(
652
  else None
653
  )
654
  dx1 = torch.empty_like(dx) if (has_x1 and dropout_p > 0.0) else None
655
- y = (
656
- torch.empty(M, N, dtype=dy.dtype, device=dy.device)
657
- if recompute_output
658
- else None
659
- )
660
  if recompute_output:
661
- assert (
662
- weight1 is None
663
- ), "recompute_output is not supported with parallel LayerNorm"
664
 
665
  # Less than 64KB per feature: enqueue fused kernel
666
  MAX_FUSED_SIZE = 65536 // x.element_size()
667
  BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(N))
668
  if N > BLOCK_N:
669
  raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
670
- sm_count = torch.cuda.get_device_properties(x.device).multi_processor_count
 
 
671
  _dw = torch.empty((sm_count, N), dtype=torch.float32, device=weight.device)
672
  _db = (
673
  torch.empty((sm_count, N), dtype=torch.float32, device=bias.device)
@@ -679,7 +789,7 @@ def _layer_norm_bwd(
679
  rows_per_program = math.ceil(M / sm_count)
680
  grid = (sm_count,)
681
  with torch.cuda.device(x.device.index):
682
- _layer_norm_bwd_kernel[grid](
683
  x,
684
  weight,
685
  bias,
@@ -711,6 +821,8 @@ def _layer_norm_bwd(
711
  N,
712
  eps,
713
  dropout_p,
 
 
714
  rows_per_program,
715
  is_rms_norm,
716
  BLOCK_N,
@@ -718,24 +830,22 @@ def _layer_norm_bwd(
718
  dresidual_in is not None,
719
  bias is not None,
720
  dropout_p > 0.0,
 
 
 
 
 
721
  )
722
  dw = _dw.sum(0).to(weight.dtype)
723
  db = _db.sum(0).to(bias.dtype) if bias is not None else None
724
  dw1 = _dw1.sum(0).to(weight1.dtype) if weight1 is not None else None
725
  db1 = _db1.sum(0).to(bias1.dtype) if bias1 is not None else None
726
- # Don't need to compute dresidual_in separately in this case
727
- if has_residual and dx.dtype == x.dtype and dropout_p == 0.0 and rowscale is None:
728
- dresidual_in = dx
729
- if has_x1 and dropout_p == 0.0:
730
- dx1 = dx
731
- return (
732
- (dx, dw, db, dresidual_in, dx1, dw1, db1)
733
- if not recompute_output
734
- else (dx, dw, db, dresidual_in, dx1, dw1, db1, y)
735
- )
736
 
737
 
738
  class LayerNormFn(torch.autograd.Function):
 
739
  @staticmethod
740
  def forward(
741
  ctx,
@@ -751,34 +861,27 @@ class LayerNormFn(torch.autograd.Function):
751
  rowscale=None,
752
  prenorm=False,
753
  residual_in_fp32=False,
 
754
  is_rms_norm=False,
755
  return_dropout_mask=False,
 
756
  out=None,
757
- residual_out=None,
758
  ):
759
  x_shape_og = x.shape
760
  # reshape input data into 2D tensor
761
- x = x.reshape(-1, x.shape[-1])
762
- if x.stride(-1) != 1:
763
- x = x.contiguous()
764
  if residual is not None:
765
  assert residual.shape == x_shape_og
766
- residual = residual.reshape(-1, residual.shape[-1])
767
- if residual.stride(-1) != 1:
768
- residual = residual.contiguous()
769
  if x1 is not None:
770
  assert x1.shape == x_shape_og
771
  assert rowscale is None, "rowscale is not supported with parallel LayerNorm"
772
- x1 = x1.reshape(-1, x1.shape[-1])
773
- if x1.stride(-1) != 1:
774
- x1 = x1.contiguous()
775
  weight = weight.contiguous()
776
- if bias is not None:
777
- bias = bias.contiguous()
778
- if weight1 is not None:
779
- weight1 = weight1.contiguous()
780
- if bias1 is not None:
781
- bias1 = bias1.contiguous()
782
  if rowscale is not None:
783
  rowscale = rowscale.reshape(-1).contiguous()
784
  residual_dtype = (
@@ -790,24 +893,24 @@ class LayerNormFn(torch.autograd.Function):
790
  out = out.reshape(-1, out.shape[-1])
791
  if residual_out is not None:
792
  residual_out = residual_out.reshape(-1, residual_out.shape[-1])
793
- y, y1, mean, rstd, residual_out, seeds, dropout_mask, dropout_mask1 = (
794
- _layer_norm_fwd(
795
- x,
796
- weight,
797
- bias,
798
- eps,
799
- residual,
800
- x1,
801
- weight1,
802
- bias1,
803
- dropout_p=dropout_p,
804
- rowscale=rowscale,
805
- residual_dtype=residual_dtype,
806
- is_rms_norm=is_rms_norm,
807
- return_dropout_mask=return_dropout_mask,
808
- out=out,
809
- residual_out=residual_out,
810
- )
811
  )
812
  ctx.save_for_backward(
813
  residual_out, weight, bias, weight1, bias1, rowscale, seeds, mean, rstd
@@ -820,17 +923,12 @@ class LayerNormFn(torch.autograd.Function):
820
  ctx.has_x1 = x1 is not None
821
  ctx.prenorm = prenorm
822
  ctx.x_dtype = x.dtype
 
823
  y = y.reshape(x_shape_og)
824
  y1 = y1.reshape(x_shape_og) if y1 is not None else None
825
- residual_out = (
826
- residual_out.reshape(x_shape_og) if residual_out is not None else None
827
- )
828
- dropout_mask = (
829
- dropout_mask.reshape(x_shape_og) if dropout_mask is not None else None
830
- )
831
- dropout_mask1 = (
832
- dropout_mask1.reshape(x_shape_og) if dropout_mask1 is not None else None
833
- )
834
  if not return_dropout_mask:
835
  if weight1 is None:
836
  return y if not prenorm else (y, residual_out)
@@ -854,26 +952,19 @@ class LayerNormFn(torch.autograd.Function):
854
  def backward(ctx, dy, *args):
855
  x, weight, bias, weight1, bias1, rowscale, seeds, mean, rstd = ctx.saved_tensors
856
  dy = dy.reshape(-1, dy.shape[-1])
857
- if dy.stride(-1) != 1:
858
- dy = dy.contiguous()
859
- assert dy.shape == x.shape
860
  if weight1 is not None:
861
  dy1, args = args[0], args[1:]
862
  dy1 = dy1.reshape(-1, dy1.shape[-1])
863
- if dy1.stride(-1) != 1:
864
- dy1 = dy1.contiguous()
865
  assert dy1.shape == x.shape
866
  else:
867
  dy1 = None
868
  if ctx.prenorm:
869
  dresidual = args[0]
870
  dresidual = dresidual.reshape(-1, dresidual.shape[-1])
871
- if dresidual.stride(-1) != 1:
872
- dresidual = dresidual.contiguous()
873
  assert dresidual.shape == x.shape
874
  else:
875
  dresidual = None
876
- dx, dw, db, dresidual_in, dx1, dw1, db1 = _layer_norm_bwd(
877
  dy,
878
  x,
879
  weight,
@@ -890,8 +981,10 @@ class LayerNormFn(torch.autograd.Function):
890
  rowscale,
891
  ctx.has_residual,
892
  ctx.has_x1,
 
893
  ctx.is_rms_norm,
894
  x_dtype=ctx.x_dtype,
 
895
  )
896
  return (
897
  dx.reshape(ctx.x_shape_og),
@@ -910,6 +1003,8 @@ class LayerNormFn(torch.autograd.Function):
910
  None,
911
  None,
912
  None,
 
 
913
  )
914
 
915
 
@@ -926,10 +1021,12 @@ def layer_norm_fn(
926
  rowscale=None,
927
  prenorm=False,
928
  residual_in_fp32=False,
 
929
  is_rms_norm=False,
930
  return_dropout_mask=False,
 
931
  out=None,
932
- residual_out=None,
933
  ):
934
  return LayerNormFn.apply(
935
  x,
@@ -944,10 +1041,12 @@ def layer_norm_fn(
944
  rowscale,
945
  prenorm,
946
  residual_in_fp32,
 
947
  is_rms_norm,
948
  return_dropout_mask,
 
949
  out,
950
- residual_out,
951
  )
952
 
953
 
@@ -964,9 +1063,11 @@ def rms_norm_fn(
964
  rowscale=None,
965
  prenorm=False,
966
  residual_in_fp32=False,
 
967
  return_dropout_mask=False,
 
968
  out=None,
969
- residual_out=None,
970
  ):
971
  return LayerNormFn.apply(
972
  x,
@@ -981,16 +1082,19 @@ def rms_norm_fn(
981
  rowscale,
982
  prenorm,
983
  residual_in_fp32,
 
984
  True,
985
  return_dropout_mask,
 
986
  out,
987
- residual_out,
988
  )
989
 
990
 
991
  class RMSNorm(torch.nn.Module):
992
 
993
- def __init__(self, hidden_size, eps=1e-5, dropout_p=0.0, device=None, dtype=None):
 
994
  factory_kwargs = {"device": device, "dtype": dtype}
995
  super().__init__()
996
  self.eps = eps
@@ -998,12 +1102,16 @@ class RMSNorm(torch.nn.Module):
998
  self.drop = torch.nn.Dropout(dropout_p)
999
  else:
1000
  self.drop = None
 
1001
  self.weight = torch.nn.Parameter(torch.empty(hidden_size, **factory_kwargs))
1002
  self.register_parameter("bias", None)
1003
  self.reset_parameters()
1004
 
1005
  def reset_parameters(self):
1006
- torch.nn.init.ones_(self.weight)
 
 
 
1007
 
1008
  def forward(self, x, residual=None, prenorm=False, residual_in_fp32=False):
1009
  return rms_norm_fn(
@@ -1015,12 +1123,14 @@ class RMSNorm(torch.nn.Module):
1015
  dropout_p=self.drop.p if self.drop is not None and self.training else 0.0,
1016
  prenorm=prenorm,
1017
  residual_in_fp32=residual_in_fp32,
 
1018
  )
1019
 
1020
 
1021
  class LayerNormLinearFn(torch.autograd.Function):
 
1022
  @staticmethod
1023
- @custom_fwd(device_type="cuda")
1024
  def forward(
1025
  ctx,
1026
  x,
@@ -1036,17 +1146,12 @@ class LayerNormLinearFn(torch.autograd.Function):
1036
  ):
1037
  x_shape_og = x.shape
1038
  # reshape input data into 2D tensor
1039
- x = x.reshape(-1, x.shape[-1])
1040
- if x.stride(-1) != 1:
1041
- x = x.contiguous()
1042
  if residual is not None:
1043
  assert residual.shape == x_shape_og
1044
- residual = residual.reshape(-1, residual.shape[-1])
1045
- if residual.stride(-1) != 1:
1046
- residual = residual.contiguous()
1047
  norm_weight = norm_weight.contiguous()
1048
- if norm_bias is not None:
1049
- norm_bias = norm_bias.contiguous()
1050
  residual_dtype = (
1051
  residual.dtype
1052
  if residual is not None
@@ -1058,25 +1163,17 @@ class LayerNormLinearFn(torch.autograd.Function):
1058
  norm_bias,
1059
  eps,
1060
  residual,
1061
- out_dtype=(
1062
- None
1063
- if not torch.is_autocast_enabled()
1064
- else torch.get_autocast_gpu_dtype()
1065
- ),
1066
  residual_dtype=residual_dtype,
1067
  is_rms_norm=is_rms_norm,
1068
  )
1069
  y = y.reshape(x_shape_og)
1070
- dtype = (
1071
- torch.get_autocast_gpu_dtype() if torch.is_autocast_enabled() else y.dtype
1072
- )
1073
  linear_weight = linear_weight.to(dtype)
1074
  linear_bias = linear_bias.to(dtype) if linear_bias is not None else None
1075
  out = F.linear(y.to(linear_weight.dtype), linear_weight, linear_bias)
1076
  # We don't store y, will be recomputed in the backward pass to save memory
1077
- ctx.save_for_backward(
1078
- residual_out, norm_weight, norm_bias, linear_weight, mean, rstd
1079
- )
1080
  ctx.x_shape_og = x_shape_og
1081
  ctx.eps = eps
1082
  ctx.is_rms_norm = is_rms_norm
@@ -1087,20 +1184,17 @@ class LayerNormLinearFn(torch.autograd.Function):
1087
  return out if not prenorm else (out, residual_out.reshape(x_shape_og))
1088
 
1089
  @staticmethod
1090
- @custom_bwd(device_type="cuda")
1091
  def backward(ctx, dout, *args):
1092
  x, norm_weight, norm_bias, linear_weight, mean, rstd = ctx.saved_tensors
1093
  dout = dout.reshape(-1, dout.shape[-1])
1094
  dy = F.linear(dout, linear_weight.t())
1095
  dlinear_bias = None if ctx.linear_bias_is_none else dout.sum(0)
1096
- if dy.stride(-1) != 1:
1097
- dy = dy.contiguous()
1098
  assert dy.shape == x.shape
1099
  if ctx.prenorm:
1100
  dresidual = args[0]
1101
- dresidual = dresidual.reshape(-1, dresidual.shape[-1])
1102
- if dresidual.stride(-1) != 1:
1103
- dresidual = dresidual.contiguous()
1104
  assert dresidual.shape == x.shape
1105
  else:
1106
  dresidual = None
 
7
  # The models we train have hidden dim up to 8k anyway (e.g. Llama 70B), so this is fine.
8
 
9
  import math
10
+ from typing import Optional, List
11
 
12
  import torch
13
  import torch.nn.functional as F
14
+ from torch import Tensor
15
 
16
  import triton
17
  import triton.language as tl
18
 
19
+ from ._ops import add_op_namespace_prefix
20
+ from .utils.torch import custom_fwd, custom_bwd
21
+ from .utils.library import triton_op
22
+
23
+
24
+ def maybe_contiguous_lastdim(x):
25
+ return x.contiguous() if x is not None and x.stride(-1) != 1 else x
26
+
27
+
28
+ def maybe_contiguous(x):
29
+ return x.contiguous() if x is not None else None
30
+
31
+
32
+ def triton_autotune_configs():
33
+ # Return configs with a valid warp count for the current device
34
+ configs = []
35
+ # Maximum threads per block is architecture-dependent in theory, but in reality all are 1024
36
+ max_threads_per_block = 1024
37
+ # Default to warp size 32 if not defined by device
38
+ warp_size = getattr(torch.cuda.get_device_properties(torch.cuda.current_device()), "warp_size", 32)
39
+ # Autotune for warp counts which are powers of 2 and do not exceed thread per block limit
40
+ return [triton.Config({}, num_warps=warp_count) for warp_count in [1, 2, 4, 8, 16, 32]
41
+ if warp_count * warp_size <= max_threads_per_block]
42
+ # return [triton.Config({}, num_warps=8)]
43
+
44
 
45
  def layer_norm_ref(
46
  x,
 
54
  dropout_p=0.0,
55
  rowscale=None,
56
  prenorm=False,
57
+ zero_centered_weight=False,
58
  dropout_mask=None,
59
  dropout_mask1=None,
60
  upcast=False,
 
68
  x1 = x1.float() if x1 is not None else None
69
  weight1 = weight1.float() if weight1 is not None else None
70
  bias1 = bias1.float() if bias1 is not None else None
71
+ if zero_centered_weight:
72
+ weight = weight + 1.0
73
+ if weight1 is not None:
74
+ weight1 = weight1 + 1.0
75
  if x1 is not None:
76
  assert rowscale is None, "rowscale is not supported with parallel LayerNorm"
77
  if rowscale is not None:
 
90
  x = x + x1
91
  if residual is not None:
92
  x = (x + residual).to(x.dtype)
93
+ out = F.layer_norm(x.to(weight.dtype), x.shape[-1:], weight=weight, bias=bias, eps=eps).to(
94
+ dtype
95
+ )
96
  if weight1 is None:
97
  return out if not prenorm else (out, x)
98
  else:
 
114
  dropout_p=0.0,
115
  rowscale=None,
116
  prenorm=False,
117
+ zero_centered_weight=False,
118
  dropout_mask=None,
119
  dropout_mask1=None,
120
  upcast=False,
 
128
  x1 = x1.float() if x1 is not None else None
129
  weight1 = weight1.float() if weight1 is not None else None
130
  bias1 = bias1.float() if bias1 is not None else None
131
+ if zero_centered_weight:
132
+ weight = weight + 1.0
133
+ if weight1 is not None:
134
+ weight1 = weight1 + 1.0
135
  if x1 is not None:
136
  assert rowscale is None, "rowscale is not supported with parallel LayerNorm"
137
  if rowscale is not None:
 
151
  if residual is not None:
152
  x = (x + residual).to(x.dtype)
153
  rstd = 1 / torch.sqrt((x.square()).mean(dim=-1, keepdim=True) + eps)
154
+ out = ((x * rstd * weight) + bias if bias is not None else (x * rstd * weight)).to(dtype)
 
 
155
  if weight1 is None:
156
  return out if not prenorm else (out, x)
157
  else:
158
+ out1 = ((x * rstd * weight1) + bias1 if bias1 is not None else (x * rstd * weight1)).to(
159
+ dtype
160
+ )
161
  return (out, out1) if not prenorm else (out, out1, x)
162
 
163
 
164
  @triton.autotune(
165
+ configs=triton_autotune_configs(),
166
+ key=["N", "HAS_RESIDUAL", "STORE_RESIDUAL_OUT", "IS_RMS_NORM", "HAS_BIAS", "HAS_X1", "HAS_W1", "HAS_B1"],
 
 
 
 
 
 
 
167
  )
168
+ # torch compile doesn't like triton.heuristics, so we set these manually when calling the kernel
169
  # @triton.heuristics({"HAS_BIAS": lambda args: args["B"] is not None})
170
  # @triton.heuristics({"HAS_RESIDUAL": lambda args: args["RESIDUAL"] is not None})
171
+ # @triton.heuristics({"HAS_X1": lambda args: args["X1"] is not None})
172
+ # @triton.heuristics({"HAS_W1": lambda args: args["W1"] is not None})
173
+ # @triton.heuristics({"HAS_B1": lambda args: args["B1"] is not None})
174
  @triton.jit
175
  def _layer_norm_fwd_1pass_kernel(
176
  X, # pointer to the input
 
186
  ROWSCALE,
187
  SEEDS, # Dropout seeds for each row
188
  DROPOUT_MASK,
189
+ DROPOUT_MASK1,
190
  Mean, # pointer to the mean
191
  Rstd, # pointer to the 1/std
192
  stride_x_row, # how much to increase the pointer when moving by 1 row
 
199
  N, # number of columns in X
200
  eps, # epsilon to avoid division by zero
201
  dropout_p, # Dropout probability
202
+ zero_centered_weight, # If true, add 1.0 to the weight
203
  IS_RMS_NORM: tl.constexpr,
204
  BLOCK_N: tl.constexpr,
205
  HAS_RESIDUAL: tl.constexpr,
 
233
  if HAS_DROPOUT:
234
  # Compute dropout mask
235
  # 7 rounds is good enough, and reduces register pressure
236
+ keep_mask = tl.rand(tl.load(SEEDS + row).to(tl.uint32), cols, n_rounds=7) > dropout_p
 
 
237
  x = tl.where(keep_mask, x / (1.0 - dropout_p), 0.0)
238
  if STORE_DROPOUT_MASK:
239
  tl.store(DROPOUT_MASK + row * N + cols, keep_mask, mask=cols < N)
 
246
  # Compute dropout mask
247
  # 7 rounds is good enough, and reduces register pressure
248
  keep_mask = (
249
+ tl.rand(tl.load(SEEDS + M + row).to(tl.uint32), cols, n_rounds=7) > dropout_p
 
250
  )
251
  x1 = tl.where(keep_mask, x1 / (1.0 - dropout_p), 0.0)
252
  if STORE_DROPOUT_MASK:
253
+ tl.store(DROPOUT_MASK1 + row * N + cols, keep_mask, mask=cols < N)
254
  x += x1
255
  if HAS_RESIDUAL:
256
  residual = tl.load(RESIDUAL + cols, mask=cols < N, other=0.0).to(tl.float32)
 
270
  # Normalize and apply linear transformation
271
  mask = cols < N
272
  w = tl.load(W + cols, mask=mask).to(tl.float32)
273
+ if zero_centered_weight:
274
+ w += 1.0
275
  if HAS_BIAS:
276
  b = tl.load(B + cols, mask=mask).to(tl.float32)
277
  x_hat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd
 
280
  tl.store(Y + cols, y, mask=mask)
281
  if HAS_W1:
282
  w1 = tl.load(W1 + cols, mask=mask).to(tl.float32)
283
+ if zero_centered_weight:
284
+ w1 += 1.0
285
  if HAS_B1:
286
  b1 = tl.load(B1 + cols, mask=mask).to(tl.float32)
287
  y1 = x_hat * w1 + b1 if HAS_B1 else x_hat * w1
 
289
 
290
 
291
  def _layer_norm_fwd(
292
+ x: Tensor,
293
+ weight: Tensor,
294
+ bias: Tensor,
295
+ eps: float,
296
+ residual: Optional[Tensor] = None,
297
+ x1: Optional[Tensor] = None,
298
+ weight1: Optional[Tensor] = None,
299
+ bias1: Optional[Tensor] = None,
300
+ dropout_p: float = 0.0,
301
+ rowscale: Optional[Tensor] = None,
302
+ out_dtype: Optional[torch.dtype] = None,
303
+ residual_dtype: Optional[torch.dtype] = None,
304
+ zero_centered_weight: bool = False,
305
+ is_rms_norm: bool = False,
306
+ return_dropout_mask: bool = False,
307
+ out: Optional[Tensor] = None,
308
+ residual_out: Optional[Tensor] = None
309
+ ) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor):
310
+ # Need to wrap to handle the case where residual_out is an alias of x, which makes torch.library
311
+ # and torch.compile unhappy. Also allocate memory for out and residual_out if they are None
312
+ # so that _layer_norm_fwd_impl doesn't have to return them.
313
+ if out is None:
314
+ out = torch.empty_like(x, dtype=x.dtype if out_dtype is None else out_dtype)
315
  if residual is not None:
316
  residual_dtype = residual.dtype
317
+ if residual_out is None and (
318
+ residual is not None
319
+ or (residual_dtype is not None and residual_dtype != x.dtype)
320
+ or dropout_p > 0.0
321
+ or rowscale is not None
322
+ or x1 is not None
323
+ ):
324
+ residual_out = torch.empty_like(
325
+ x, dtype=residual_dtype if residual_dtype is not None else x.dtype
326
+ )
327
+ else:
328
+ residual_out = None
329
+ y1, mean, rstd, seeds, dropout_mask, dropout_mask1 = _layer_norm_fwd_impl(
330
+ x,
331
+ weight,
332
+ bias,
333
+ eps,
334
+ out,
335
+ residual=residual,
336
+ x1=x1,
337
+ weight1=weight1,
338
+ bias1=bias1,
339
+ dropout_p=dropout_p,
340
+ rowscale=rowscale,
341
+ zero_centered_weight=zero_centered_weight,
342
+ is_rms_norm=is_rms_norm,
343
+ return_dropout_mask=return_dropout_mask,
344
+ residual_out=residual_out,
345
+ )
346
+ # residual_out is None if residual is None and residual_dtype == input_dtype and dropout_p == 0.0
347
+ if residual_out is None:
348
+ residual_out = x
349
+ return out, y1, mean, rstd, residual_out, seeds, dropout_mask, dropout_mask1
350
+
351
+
352
+ # [2025-04-28] torch.library.triton_op ignores the schema argument, but here we need the schema
353
+ # since we're returning a tuple of tensors
354
+ @triton_op(add_op_namespace_prefix("layer_norm_fwd_impl"), mutates_args={"out", "residual_out"},
355
+ schema="(Tensor x, Tensor weight, Tensor bias, float eps, Tensor(a!) out, Tensor? residual, Tensor? x1, Tensor? weight1, Tensor? bias1, float dropout_p, Tensor? rowscale, bool zero_centered_weight, bool is_rms_norm, bool return_dropout_mask, Tensor(a!)? residual_out) -> (Tensor y1, Tensor mean, Tensor rstd, Tensor seeds, Tensor dropout_mask, Tensor dropout_mask1)")
356
+ def _layer_norm_fwd_impl(
357
+ x: Tensor,
358
+ weight: Tensor,
359
+ bias: Tensor,
360
+ eps: float,
361
+ out: Tensor,
362
+ residual: Optional[Tensor] = None,
363
+ x1: Optional[Tensor] = None,
364
+ weight1: Optional[Tensor] = None,
365
+ bias1: Optional[Tensor] = None,
366
+ dropout_p: float = 0.0,
367
+ rowscale: Optional[Tensor] = None,
368
+ zero_centered_weight: bool = False,
369
+ is_rms_norm: bool = False,
370
+ return_dropout_mask: bool = False,
371
+ residual_out: Optional[Tensor] = None
372
+ ) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor):
373
  M, N = x.shape
374
  assert x.stride(-1) == 1
375
  if residual is not None:
 
393
  if rowscale is not None:
394
  assert rowscale.is_contiguous()
395
  assert rowscale.shape == (M,)
396
+ assert out.shape == x.shape
 
 
 
 
397
  assert out.stride(-1) == 1
398
+ if residual_out is not None:
399
+ assert residual_out.shape == x.shape
400
+ assert residual_out.stride(-1) == 1
401
  if weight1 is not None:
402
  y1 = torch.empty_like(out)
403
  assert y1.stride(-1) == 1
404
  else:
405
  y1 = None
406
+ mean = torch.empty((M,), dtype=torch.float32, device=x.device) if not is_rms_norm else None
 
 
 
 
 
407
  rstd = torch.empty((M,), dtype=torch.float32, device=x.device)
408
  if dropout_p > 0.0:
409
  seeds = torch.randint(
 
412
  else:
413
  seeds = None
414
  if return_dropout_mask and dropout_p > 0.0:
415
+ dropout_mask = torch.empty(M, N, device=x.device, dtype=torch.bool)
416
+ if x1 is not None:
417
+ dropout_mask1 = torch.empty(M, N, device=x.device, dtype=torch.bool)
418
+ else:
419
+ dropout_mask1 = None
420
  else:
421
+ dropout_mask, dropout_mask1 = None, None
422
  # Less than 64KB per feature: enqueue fused kernel
423
  MAX_FUSED_SIZE = 65536 // x.element_size()
424
  BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(N))
425
  if N > BLOCK_N:
426
  raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
427
  with torch.cuda.device(x.device.index):
428
+ torch.library.wrap_triton(_layer_norm_fwd_1pass_kernel)[(M,)](
429
  x,
430
  out,
431
  weight,
 
439
  rowscale,
440
  seeds,
441
  dropout_mask,
442
+ dropout_mask1,
443
  mean,
444
  rstd,
445
  x.stride(0),
 
452
  N,
453
  eps,
454
  dropout_p,
455
+ # Passing bool makes torch inductor very unhappy since it then tries to compare to int_max
456
+ int(zero_centered_weight),
457
  is_rms_norm,
458
  BLOCK_N,
459
  residual is not None,
 
462
  dropout_p > 0.0,
463
  dropout_mask is not None,
464
  rowscale is not None,
465
+ HAS_X1=x1 is not None,
466
+ HAS_W1=weight1 is not None,
467
+ HAS_B1=bias1 is not None,
468
  )
469
+ return y1, mean, rstd, seeds, dropout_mask, dropout_mask1
 
 
 
 
470
 
471
 
472
  @triton.autotune(
473
+ configs=triton_autotune_configs(),
474
+ key=["N", "HAS_DRESIDUAL", "STORE_DRESIDUAL", "IS_RMS_NORM", "HAS_BIAS", "HAS_DROPOUT"],
 
 
 
 
475
  )
476
+ # torch compile doesn't like triton.heuristics, so we set these manually when calling the kernel
477
  # @triton.heuristics({"HAS_BIAS": lambda args: args["B"] is not None})
478
  # @triton.heuristics({"HAS_DRESIDUAL": lambda args: args["DRESIDUAL"] is not None})
479
  # @triton.heuristics({"STORE_DRESIDUAL": lambda args: args["DRESIDUAL_IN"] is not None})
480
+ # @triton.heuristics({"HAS_ROWSCALE": lambda args: args["ROWSCALE"] is not None})
481
+ # @triton.heuristics({"HAS_DY1": lambda args: args["DY1"] is not None})
482
+ # @triton.heuristics({"HAS_DX1": lambda args: args["DX1"] is not None})
483
+ # @triton.heuristics({"HAS_B1": lambda args: args["DB1"] is not None})
484
+ # @triton.heuristics({"RECOMPUTE_OUTPUT": lambda args: args["Y"] is not None})
485
  @triton.jit
486
  def _layer_norm_bwd_kernel(
487
  X, # pointer to the input
 
515
  N, # number of columns in X
516
  eps, # epsilon to avoid division by zero
517
  dropout_p,
518
+ zero_centered_weight,
519
  rows_per_program,
520
  IS_RMS_NORM: tl.constexpr,
521
  BLOCK_N: tl.constexpr,
 
549
  if RECOMPUTE_OUTPUT:
550
  Y += row_start * stride_y_row
551
  w = tl.load(W + cols, mask=mask).to(tl.float32)
552
+ if zero_centered_weight:
553
+ w += 1.0
554
  if RECOMPUTE_OUTPUT and HAS_BIAS:
555
  b = tl.load(B + cols, mask=mask, other=0.0).to(tl.float32)
556
  if HAS_DY1:
557
  w1 = tl.load(W1 + cols, mask=mask).to(tl.float32)
558
+ if zero_centered_weight:
559
+ w1 += 1.0
560
  dw = tl.zeros((BLOCK_N,), dtype=tl.float32)
561
  if HAS_BIAS:
562
  db = tl.zeros((BLOCK_N,), dtype=tl.float32)
 
605
  if HAS_DX1:
606
  if HAS_DROPOUT:
607
  keep_mask = (
608
+ tl.rand(tl.load(SEEDS + M + row).to(tl.uint32), cols, n_rounds=7) > dropout_p
 
609
  )
610
  dx1 = tl.where(keep_mask, dx / (1.0 - dropout_p), 0.0)
611
  else:
612
  dx1 = dx
613
  tl.store(DX1 + cols, dx1, mask=mask)
614
  if HAS_DROPOUT:
615
+ keep_mask = tl.rand(tl.load(SEEDS + row).to(tl.uint32), cols, n_rounds=7) > dropout_p
 
 
 
616
  dx = tl.where(keep_mask, dx / (1.0 - dropout_p), 0.0)
617
  if HAS_ROWSCALE:
618
  rowscale = tl.load(ROWSCALE + row).to(tl.float32)
 
642
 
643
 
644
  def _layer_norm_bwd(
645
+ dy: Tensor,
646
+ x: Tensor,
647
+ weight: Tensor,
648
+ bias: Tensor,
649
+ eps: float,
650
+ mean: Tensor,
651
+ rstd: Tensor,
652
+ dresidual: Optional[Tensor] = None,
653
+ dy1: Optional[Tensor] = None,
654
+ weight1: Optional[Tensor] = None,
655
+ bias1: Optional[Tensor] = None,
656
+ seeds: Optional[Tensor] = None,
657
+ dropout_p: float = 0.0,
658
+ rowscale: Optional[Tensor] = None,
659
+ has_residual: bool = False,
660
+ has_x1: bool = False,
661
+ zero_centered_weight: bool = False,
662
+ is_rms_norm: bool = False,
663
+ x_dtype: Optional[torch.dtype] = None,
664
+ recompute_output: bool = False,
665
+ ) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor):
666
+ # Need to wrap to handle the case where dresidual_in or dx1 are aliases of x,
667
+ # which makes torch.library unhappy
668
+ dx, dw, db, dresidual_in, dx1, dw1, db1, y = _layer_norm_bwd_impl(
669
+ dy,
670
+ x,
671
+ weight,
672
+ bias,
673
+ eps,
674
+ mean,
675
+ rstd,
676
+ dresidual,
677
+ dy1,
678
+ weight1,
679
+ bias1,
680
+ seeds,
681
+ dropout_p,
682
+ rowscale,
683
+ has_residual,
684
+ has_x1,
685
+ zero_centered_weight,
686
+ is_rms_norm,
687
+ x_dtype=x_dtype,
688
+ recompute_output=recompute_output,
689
+ )
690
+ # Don't need to compute dresidual_in separately in this case
691
+ if has_residual and dx.dtype == x.dtype and dropout_p == 0.0 and rowscale is None:
692
+ dresidual_in = dx
693
+ if has_x1 and dropout_p == 0.0:
694
+ dx1 = dx
695
+ return dx, dw, db, dresidual_in, dx1, dw1, db1, y
696
+
697
+
698
+
699
+ @triton_op(add_op_namespace_prefix("layer_norm_bwd_impl"), mutates_args={},
700
+ schema="(Tensor dy, Tensor x, Tensor weight, Tensor bias, float eps, Tensor mean, Tensor rstd, Tensor? dresidual, Tensor? dy1, Tensor? weight1, Tensor? bias1, Tensor? seeds, float dropout_p, Tensor? rowscale, bool has_residual, bool has_x1, bool zero_centered_weight, bool is_rms_norm, ScalarType? x_dtype, bool recompute_output) -> (Tensor dx, Tensor dw, Tensor db, Tensor dresidual_in, Tensor dx1, Tensor dw1, Tensor db1, Tensor y)",
701
+ allow_decomposition=False, # Don't let torch.compile trace inside
702
+ )
703
+ def _layer_norm_bwd_impl(
704
+ dy: Tensor,
705
+ x: Tensor,
706
+ weight: Tensor,
707
+ bias: Tensor,
708
+ eps: float,
709
+ mean: Tensor,
710
+ rstd: Tensor,
711
+ dresidual: Optional[Tensor] = None,
712
+ dy1: Optional[Tensor] = None,
713
+ weight1: Optional[Tensor] = None,
714
+ bias1: Optional[Tensor] = None,
715
+ seeds: Optional[Tensor] = None,
716
+ dropout_p: float = 0.0,
717
+ rowscale: Optional[Tensor] = None,
718
+ has_residual: bool = False,
719
+ has_x1: bool = False,
720
+ zero_centered_weight: bool = False,
721
+ is_rms_norm: bool = False,
722
+ x_dtype: Optional[torch.dtype] = None,
723
+ recompute_output: bool = False,
724
+ ) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor):
725
  M, N = x.shape
726
  assert x.stride(-1) == 1
727
+ dy = maybe_contiguous_lastdim(dy)
728
  assert dy.stride(-1) == 1
729
  assert dy.shape == (M, N)
730
  if dresidual is not None:
731
+ dresidual = maybe_contiguous_lastdim(dresidual)
732
  assert dresidual.stride(-1) == 1
733
  assert dresidual.shape == (M, N)
734
  assert weight.shape == (N,)
 
737
  assert bias.stride(-1) == 1
738
  assert bias.shape == (N,)
739
  if dy1 is not None:
740
+ dy1 = maybe_contiguous_lastdim(dy1)
741
  assert weight1 is not None
742
  assert dy1.shape == dy.shape
743
  assert dy1.stride(-1) == 1
 
766
  else None
767
  )
768
  dx1 = torch.empty_like(dx) if (has_x1 and dropout_p > 0.0) else None
769
+ y = torch.empty(M, N, dtype=dy.dtype, device=dy.device) if recompute_output else None
 
 
 
 
770
  if recompute_output:
771
+ assert weight1 is None, "recompute_output is not supported with parallel LayerNorm"
 
 
772
 
773
  # Less than 64KB per feature: enqueue fused kernel
774
  MAX_FUSED_SIZE = 65536 // x.element_size()
775
  BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(N))
776
  if N > BLOCK_N:
777
  raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
778
+ # Increasing the multiple (e.g. 8) will allow more thread blocks to be launched and hide the
779
+ # latency of the gmem reads/writes, but will increase the time spent summing up dw / db.
780
+ sm_count = torch.cuda.get_device_properties(x.device).multi_processor_count * 8
781
  _dw = torch.empty((sm_count, N), dtype=torch.float32, device=weight.device)
782
  _db = (
783
  torch.empty((sm_count, N), dtype=torch.float32, device=bias.device)
 
789
  rows_per_program = math.ceil(M / sm_count)
790
  grid = (sm_count,)
791
  with torch.cuda.device(x.device.index):
792
+ torch.library.wrap_triton(_layer_norm_bwd_kernel)[grid](
793
  x,
794
  weight,
795
  bias,
 
821
  N,
822
  eps,
823
  dropout_p,
824
+ # Passing a bool makes torch inductor very unhappy, since it then tries to compare it to int_max
825
+ int(zero_centered_weight),
826
  rows_per_program,
827
  is_rms_norm,
828
  BLOCK_N,
 
830
  dresidual_in is not None,
831
  bias is not None,
832
  dropout_p > 0.0,
833
+ HAS_ROWSCALE=rowscale is not None,
834
+ HAS_DY1=dy1 is not None,
835
+ HAS_DX1=dx1 is not None,
836
+ HAS_B1=bias1 is not None,
837
+ RECOMPUTE_OUTPUT=y is not None,
838
  )
839
  dw = _dw.sum(0).to(weight.dtype)
840
  db = _db.sum(0).to(bias.dtype) if bias is not None else None
841
  dw1 = _dw1.sum(0).to(weight1.dtype) if weight1 is not None else None
842
  db1 = _db1.sum(0).to(bias1.dtype) if bias1 is not None else None
843
+ # dresidual_in and dx1 could be None; the wrapper will handle assigning them from dx
844
+ return dx, dw, db, dresidual_in, dx1, dw1, db1, y
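
The weight/bias gradients are computed as a split reduction: the launch uses `sm_count * 8` programs, each accumulating a float32 partial row of `dw`/`db`, and the host sums the partials afterwards. A small sketch of that two-stage pattern with hypothetical sizes:

```python
import math
import torch

M, N = 8192, 1024                       # hypothetical problem size
sm_count = 132 * 8                      # e.g. 132 SMs on an H100, times the occupancy multiple

# Stage 1 (done by the kernel): each program accumulates one fp32 partial row of dw.
partial_dw = torch.randn(sm_count, N, dtype=torch.float32)   # stand-in for the kernel's output

# Stage 2 (done on the host): reduce the partials, matching `_dw.sum(0).to(weight.dtype)` above.
dw = partial_dw.sum(0)

rows_per_program = math.ceil(M / sm_count)   # as in the launch configuration
```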
 
 
 
 
 
 
 
 
845
 
846
 
847
  class LayerNormFn(torch.autograd.Function):
848
+
849
  @staticmethod
850
  def forward(
851
  ctx,
 
861
  rowscale=None,
862
  prenorm=False,
863
  residual_in_fp32=False,
864
+ zero_centered_weight=False,
865
  is_rms_norm=False,
866
  return_dropout_mask=False,
867
+ out_dtype=None,
868
  out=None,
869
+ residual_out=None
870
  ):
871
  x_shape_og = x.shape
872
  # reshape input data into 2D tensor
873
+ x = maybe_contiguous_lastdim(x.reshape(-1, x.shape[-1]))
 
 
874
  if residual is not None:
875
  assert residual.shape == x_shape_og
876
+ residual = maybe_contiguous_lastdim(residual.reshape(-1, residual.shape[-1]))
 
 
877
  if x1 is not None:
878
  assert x1.shape == x_shape_og
879
  assert rowscale is None, "rowscale is not supported with parallel LayerNorm"
880
+ x1 = maybe_contiguous_lastdim(x1.reshape(-1, x1.shape[-1]))
 
 
881
  weight = weight.contiguous()
882
+ bias = maybe_contiguous(bias)
883
+ weight1 = maybe_contiguous(weight1)
884
+ bias1 = maybe_contiguous(bias1)
 
 
 
885
  if rowscale is not None:
886
  rowscale = rowscale.reshape(-1).contiguous()
887
  residual_dtype = (
 
893
  out = out.reshape(-1, out.shape[-1])
894
  if residual_out is not None:
895
  residual_out = residual_out.reshape(-1, residual_out.shape[-1])
896
+ y, y1, mean, rstd, residual_out, seeds, dropout_mask, dropout_mask1 = _layer_norm_fwd(
897
+ x,
898
+ weight,
899
+ bias,
900
+ eps,
901
+ residual,
902
+ x1,
903
+ weight1,
904
+ bias1,
905
+ dropout_p=dropout_p,
906
+ rowscale=rowscale,
907
+ out_dtype=out_dtype,
908
+ residual_dtype=residual_dtype,
909
+ zero_centered_weight=zero_centered_weight,
910
+ is_rms_norm=is_rms_norm,
911
+ return_dropout_mask=return_dropout_mask,
912
+ out=out,
913
+ residual_out=residual_out,
914
  )
915
  ctx.save_for_backward(
916
  residual_out, weight, bias, weight1, bias1, rowscale, seeds, mean, rstd
 
923
  ctx.has_x1 = x1 is not None
924
  ctx.prenorm = prenorm
925
  ctx.x_dtype = x.dtype
926
+ ctx.zero_centered_weight = zero_centered_weight
927
  y = y.reshape(x_shape_og)
928
  y1 = y1.reshape(x_shape_og) if y1 is not None else None
929
+ residual_out = residual_out.reshape(x_shape_og) if residual_out is not None else None
930
+ dropout_mask = dropout_mask.reshape(x_shape_og) if dropout_mask is not None else None
931
+ dropout_mask1 = dropout_mask1.reshape(x_shape_og) if dropout_mask1 is not None else None
 
 
 
 
 
 
932
  if not return_dropout_mask:
933
  if weight1 is None:
934
  return y if not prenorm else (y, residual_out)
 
952
  def backward(ctx, dy, *args):
953
  x, weight, bias, weight1, bias1, rowscale, seeds, mean, rstd = ctx.saved_tensors
954
  dy = dy.reshape(-1, dy.shape[-1])
 
 
 
955
  if weight1 is not None:
956
  dy1, args = args[0], args[1:]
957
  dy1 = dy1.reshape(-1, dy1.shape[-1])
 
 
958
  assert dy1.shape == x.shape
959
  else:
960
  dy1 = None
961
  if ctx.prenorm:
962
  dresidual = args[0]
963
  dresidual = dresidual.reshape(-1, dresidual.shape[-1])
 
 
964
  assert dresidual.shape == x.shape
965
  else:
966
  dresidual = None
967
+ dx, dw, db, dresidual_in, dx1, dw1, db1, _ = _layer_norm_bwd(
968
  dy,
969
  x,
970
  weight,
 
981
  rowscale,
982
  ctx.has_residual,
983
  ctx.has_x1,
984
+ ctx.zero_centered_weight,
985
  ctx.is_rms_norm,
986
  x_dtype=ctx.x_dtype,
987
+ recompute_output=False,
988
  )
989
  return (
990
  dx.reshape(ctx.x_shape_og),
 
1003
  None,
1004
  None,
1005
  None,
1006
+ None,
1007
+ None,
1008
  )
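
The trailing `None`s exist because `autograd.Function.backward` must return exactly one value per argument of `forward`, with `None` for inputs that need no gradient; since the forward signature gained `zero_centered_weight`, `out_dtype`, and `residual_out`, `backward` returns matching `None` slots for them. A minimal illustration (a toy function, not the layer-norm op):

```python
import torch

class _Scale(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x, alpha, out_dtype=None):
        ctx.alpha = alpha
        y = x * alpha
        return y.to(out_dtype) if out_dtype is not None else y

    @staticmethod
    def backward(ctx, grad_out):
        # One entry per forward input: a gradient for x, None for alpha and out_dtype.
        return grad_out * ctx.alpha, None, None
```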
1009
 
1010
 
 
1021
  rowscale=None,
1022
  prenorm=False,
1023
  residual_in_fp32=False,
1024
+ zero_centered_weight=False,
1025
  is_rms_norm=False,
1026
  return_dropout_mask=False,
1027
+ out_dtype=None,
1028
  out=None,
1029
+ residual_out=None
1030
  ):
1031
  return LayerNormFn.apply(
1032
  x,
 
1041
  rowscale,
1042
  prenorm,
1043
  residual_in_fp32,
1044
+ zero_centered_weight,
1045
  is_rms_norm,
1046
  return_dropout_mask,
1047
+ out_dtype,
1048
  out,
1049
+ residual_out
1050
  )
1051
 
1052
 
 
1063
  rowscale=None,
1064
  prenorm=False,
1065
  residual_in_fp32=False,
1066
+ zero_centered_weight=False,
1067
  return_dropout_mask=False,
1068
+ out_dtype=None,
1069
  out=None,
1070
+ residual_out=None
1071
  ):
1072
  return LayerNormFn.apply(
1073
  x,
 
1082
  rowscale,
1083
  prenorm,
1084
  residual_in_fp32,
1085
+ zero_centered_weight,
1086
  True,
1087
  return_dropout_mask,
1088
+ out_dtype,
1089
  out,
1090
+ residual_out
1091
  )
1092
 
1093
 
1094
  class RMSNorm(torch.nn.Module):
1095
 
1096
+ def __init__(self, hidden_size, eps=1e-5, dropout_p=0.0, zero_centered_weight=False,
1097
+ device=None, dtype=None):
1098
  factory_kwargs = {"device": device, "dtype": dtype}
1099
  super().__init__()
1100
  self.eps = eps
 
1102
  self.drop = torch.nn.Dropout(dropout_p)
1103
  else:
1104
  self.drop = None
1105
+ self.zero_centered_weight = zero_centered_weight
1106
  self.weight = torch.nn.Parameter(torch.empty(hidden_size, **factory_kwargs))
1107
  self.register_parameter("bias", None)
1108
  self.reset_parameters()
1109
 
1110
  def reset_parameters(self):
1111
+ if not self.zero_centered_weight:
1112
+ torch.nn.init.ones_(self.weight)
1113
+ else:
1114
+ torch.nn.init.zeros_(self.weight)
1115
 
1116
  def forward(self, x, residual=None, prenorm=False, residual_in_fp32=False):
1117
  return rms_norm_fn(
 
1123
  dropout_p=self.drop.p if self.drop is not None and self.training else 0.0,
1124
  prenorm=prenorm,
1125
  residual_in_fp32=residual_in_fp32,
1126
+ zero_centered_weight=self.zero_centered_weight,
1127
  )
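
With `zero_centered_weight=True`, the parameter is stored as an offset around zero (initialized to zeros above) and the kernels add `1.0` back before scaling, so the module matches a standard RMSNorm whose effective weight is `weight + 1`. A reference formulation in plain PyTorch (a sketch of the semantics only, not the fused dropout/residual path):

```python
import torch

def rms_norm_reference(x, weight, eps=1e-5, zero_centered_weight=False):
    # Effective scale: the kernel does `w += 1.0` when zero_centered_weight is set.
    w = weight + 1.0 if zero_centered_weight else weight
    rstd = torch.rsqrt(x.float().pow(2).mean(dim=-1, keepdim=True) + eps)
    return (x.float() * rstd * w.float()).to(x.dtype)
```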
1128
 
1129
 
1130
  class LayerNormLinearFn(torch.autograd.Function):
1131
+
1132
  @staticmethod
1133
+ @custom_fwd
1134
  def forward(
1135
  ctx,
1136
  x,
 
1146
  ):
1147
  x_shape_og = x.shape
1148
  # reshape input data into 2D tensor
1149
+ x = maybe_contiguous_lastdim(x.reshape(-1, x.shape[-1]))
 
 
1150
  if residual is not None:
1151
  assert residual.shape == x_shape_og
1152
+ residual = maybe_contiguous_lastdim(residual.reshape(-1, residual.shape[-1]))
 
 
1153
  norm_weight = norm_weight.contiguous()
1154
+ norm_bias = maybe_contiguous(norm_bias)
 
1155
  residual_dtype = (
1156
  residual.dtype
1157
  if residual is not None
 
1163
  norm_bias,
1164
  eps,
1165
  residual,
1166
+ out_dtype=None if not torch.is_autocast_enabled() else torch.get_autocast_dtype("cuda"),
 
 
 
 
1167
  residual_dtype=residual_dtype,
1168
  is_rms_norm=is_rms_norm,
1169
  )
1170
  y = y.reshape(x_shape_og)
1171
+ dtype = torch.get_autocast_dtype("cuda") if torch.is_autocast_enabled() else y.dtype
 
 
1172
  linear_weight = linear_weight.to(dtype)
1173
  linear_bias = linear_bias.to(dtype) if linear_bias is not None else None
1174
  out = F.linear(y.to(linear_weight.dtype), linear_weight, linear_bias)
1175
  # We don't store y, will be recomputed in the backward pass to save memory
1176
+ ctx.save_for_backward(residual_out, norm_weight, norm_bias, linear_weight, mean, rstd)
 
 
1177
  ctx.x_shape_og = x_shape_og
1178
  ctx.eps = eps
1179
  ctx.is_rms_norm = is_rms_norm
 
1184
  return out if not prenorm else (out, residual_out.reshape(x_shape_og))
1185
 
1186
  @staticmethod
1187
+ @custom_bwd
1188
  def backward(ctx, dout, *args):
1189
  x, norm_weight, norm_bias, linear_weight, mean, rstd = ctx.saved_tensors
1190
  dout = dout.reshape(-1, dout.shape[-1])
1191
  dy = F.linear(dout, linear_weight.t())
1192
  dlinear_bias = None if ctx.linear_bias_is_none else dout.sum(0)
1193
+ dy = maybe_contiguous_lastdim(dy)
 
1194
  assert dy.shape == x.shape
1195
  if ctx.prenorm:
1196
  dresidual = args[0]
1197
+ dresidual = maybe_contiguous_lastdim(dresidual.reshape(-1, dresidual.shape[-1]))
 
 
1198
  assert dresidual.shape == x.shape
1199
  else:
1200
  dresidual = None
torch-ext/triton_layer_norm/utils/__init__.py ADDED
File without changes
torch-ext/triton_layer_norm/utils/library.py ADDED
@@ -0,0 +1,66 @@
1
+ # Adapted from https://github.com/pytorch/pytorch/blob/v2.7.0/torch/_library/triton.py
2
+ # The PyTorch implementation ignores the schema argument; this version is modified to actually pass the schema through.
3
+
4
+ from typing import Optional, Callable, Iterable, Union
5
+
6
+ from torch.library import custom_op, CustomOpDef
7
+ from torch._library.triton import set_wrap_triton_enabled
8
+
9
+
10
+ def triton_op(
11
+ name: str,
12
+ fn: Optional[Callable] = None,
13
+ /,
14
+ *,
15
+ mutates_args: Union[str, Iterable[str]],
16
+ schema: Optional[str] = None,
17
+ # If allow_decomposition=True, this matches torch.library.triton_op behavior. If set to False,
18
+ # then it behaves like torch.library.custom_op instead, which doesn't decompose the operator
19
+ # and so inductor can't trace inside.
20
+ allow_decomposition=True,
21
+ ) -> Callable:
22
+ def dec(fn: Callable[..., object]) -> CustomOpDef:
23
+ def backend_fn(*args, **kwargs): # type: ignore[no-untyped-def]
24
+ # Optimization: we're passing regular Tensors into the triton kernel, so
25
+ # no need to go through HOP dispatch
26
+ with set_wrap_triton_enabled(False):
27
+ return fn(*args, **kwargs)
28
+
29
+ result = custom_op(
30
+ name,
31
+ backend_fn,
32
+ mutates_args=mutates_args,
33
+ # This is the only difference with the PyTorch implementation
34
+ schema=schema,
35
+ )
36
+ from torch._subclasses.functional_tensor import FunctionalTensorMode
37
+
38
+ # We require that the user pass us a function that is make_fx traceable,
39
+ # so we can just register it as the Fake/meta kernel.
40
+ result.register_fake(fn)
41
+
42
+ if allow_decomposition:
43
+ # We decompose the operator when FunctionalTensorMode is active.
44
+ # The goal is to decompose the operator in AOTDispatcher.
45
+ # - With torch.compile, this means that the backend (usually Inductor)
46
+ # can see a call to the triton kernel(s) and so it can directly optimize
47
+ # them by inlining them into the lowering process.
48
+ def functional_decomp( # type: ignore[no-untyped-def]
49
+ mode, op, types, args, kwargs
50
+ ):
51
+ from torch.export._trace import custom_triton_ops_decomposition_disabled
52
+
53
+ if custom_triton_ops_decomposition_disabled():
54
+ return mode.__torch_dispatch__(op, types, args, kwargs)
55
+ else:
56
+ with mode:
57
+ return fn(*args, **kwargs)
58
+
59
+ result.register_torch_dispatch(FunctionalTensorMode, functional_decomp)
60
+
61
+ return result
62
+
63
+ if fn is None:
64
+ return dec
65
+ else:
66
+ return dec(fn)
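
In practice the wrapper is used like `torch.library.triton_op`, except that the explicit schema string is forwarded to `custom_op` and `allow_decomposition=False` keeps Inductor from tracing into the Triton launches. A hedged usage sketch with a made-up op name (the real registrations in this repo go through `add_op_namespace_prefix`):

```python
# Hypothetical registration through the wrapper above; names are illustrative.
import torch
from torch import Tensor

@triton_op(
    "mylib::scale",                       # made-up namespace::name
    mutates_args={},
    schema="(Tensor x, float alpha) -> Tensor",
    allow_decomposition=False,            # opaque to Inductor, like torch.library.custom_op
)
def scale(x: Tensor, alpha: float) -> Tensor:
    # A real kernel would launch Triton via torch.library.wrap_triton(...) here.
    return x * alpha

# After registration the op is callable as torch.ops.mylib.scale(x, 0.5)
# and torch.compile treats it as an opaque custom op rather than tracing its body.
```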
torch-ext/triton_layer_norm/utils/torch.py ADDED
@@ -0,0 +1,21 @@
1
+ import torch
2
+ from typing import Callable
3
+
4
+
5
+ def custom_amp_decorator(dec: Callable, cuda_amp_deprecated: bool):
6
+ def decorator(*args, **kwargs):
7
+ if cuda_amp_deprecated:
8
+ kwargs["device_type"] = "cuda"
9
+ return dec(*args, **kwargs)
10
+ return decorator
11
+
12
+
13
+ if hasattr(torch.amp, "custom_fwd"): # type: ignore[attr-defined]
14
+ deprecated = True
15
+ from torch.amp import custom_fwd, custom_bwd # type: ignore[attr-defined]
16
+ else:
17
+ deprecated = False
18
+ from torch.cuda.amp import custom_fwd, custom_bwd
19
+
20
+ custom_fwd = custom_amp_decorator(custom_fwd, deprecated)
21
+ custom_bwd = custom_amp_decorator(custom_bwd, deprecated)
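
A short sketch of how the shim is consumed: the decorators behave the same on old and new PyTorch, with `device_type="cuda"` filled in automatically where the `torch.amp` variants require it (illustrative autograd function, not from the diff):

```python
import torch

class _Square(torch.autograd.Function):
    @staticmethod
    @custom_fwd            # resolves to torch.amp.custom_fwd(..., device_type="cuda") on recent torch
    def forward(ctx, x):
        ctx.save_for_backward(x)
        return x * x

    @staticmethod
    @custom_bwd
    def backward(ctx, grad_out):
        (x,) = ctx.saved_tensors
        return 2 * x * grad_out
```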