TaehyunKim

TaehyunKimMotif committed on Sep 9

Commit

a1e5ca8

unverified ·

1 Parent(s): e5e2eeb

Fix fused add rms norm (#4)

* make fused add rms norm return 2 outputs

* add builds

* add plot

---------

Co-authored-by: taehyun <taehyun.kim@motiftech.io>

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

README.md +4 -2
activation/fused_add_rms_norm.cu +221 -3
activation/fused_mul_poly_norm.cu +4 -4
benchmarks/cases/add_rms.py +2 -1
benchmarks/common/bench_framework.py +8 -2
benchmarks/common/diff_engine.py +4 -1
benchmarks/plots/h100/add_rms/plot_add_rms-bwd-perf.png +0 -0
benchmarks/plots/h100/add_rms/plot_add_rms-fwd-perf.png +0 -0
benchmarks/plots/h100/mul_poly/plot_mul_poly-bwd-perf.png +0 -0
benchmarks/plots/h100/mul_poly/plot_mul_poly-fwd-perf.png +0 -0
benchmarks/plots/h100/poly/plot_poly-bwd-perf.png +0 -0
benchmarks/plots/h100/poly/plot_poly-fwd-perf.png +0 -0
benchmarks/plots/h100/rms/plot_rms-bwd-perf.png +0 -0
benchmarks/plots/h100/rms/plot_rms-fwd-perf.png +0 -0
benchmarks/plots/mi250/add_rms/plot_add_rms-bwd-perf.png +0 -0
benchmarks/plots/mi250/add_rms/plot_add_rms-fwd-perf.png +0 -0
build/torch27-cxx11-cu118-x86_64-linux/activation/__init__.py +1 -1
build/torch27-cxx11-cu118-x86_64-linux/activation/_activation_e5e2eeb_dirty.abi3.so +3 -0
build/torch27-cxx11-cu118-x86_64-linux/activation/_ops.py +3 -3
build/torch27-cxx11-cu118-x86_64-linux/activation/layers.py +1 -1
build/torch27-cxx11-cu118-x86_64-linux/activation/rms_norm.py +2 -8
build/torch27-cxx11-cu126-x86_64-linux/activation/__init__.py +1 -1
build/torch27-cxx11-cu126-x86_64-linux/activation/_activation_e5e2eeb_dirty.abi3.so +3 -0
build/torch27-cxx11-cu126-x86_64-linux/activation/_ops.py +3 -3
build/torch27-cxx11-cu126-x86_64-linux/activation/layers.py +1 -1
build/torch27-cxx11-cu126-x86_64-linux/activation/rms_norm.py +2 -8
build/torch27-cxx11-cu128-x86_64-linux/activation/__init__.py +1 -1
build/torch27-cxx11-cu128-x86_64-linux/activation/_activation_e5e2eeb_dirty.abi3.so +3 -0
build/torch27-cxx11-cu128-x86_64-linux/activation/_ops.py +3 -3
build/torch27-cxx11-cu128-x86_64-linux/activation/layers.py +1 -1
build/torch27-cxx11-cu128-x86_64-linux/activation/rms_norm.py +2 -8
build/torch27-cxx11-rocm63-x86_64-linux/activation/__init__.py +1 -1
build/torch27-cxx11-rocm63-x86_64-linux/activation/_activation_e5e2eeb_dirty.abi3.so +3 -0
build/torch27-cxx11-rocm63-x86_64-linux/activation/_ops.py +3 -3
build/torch27-cxx11-rocm63-x86_64-linux/activation/layers.py +1 -1
build/torch27-cxx11-rocm63-x86_64-linux/activation/rms_norm.py +2 -8
build/torch28-cxx11-cu126-x86_64-linux/activation/__init__.py +1 -1
build/torch28-cxx11-cu126-x86_64-linux/activation/_activation_e5e2eeb_dirty.abi3.so +3 -0
build/torch28-cxx11-cu126-x86_64-linux/activation/_ops.py +3 -3
build/torch28-cxx11-cu126-x86_64-linux/activation/layers.py +1 -1
build/torch28-cxx11-cu126-x86_64-linux/activation/rms_norm.py +2 -8
build/torch28-cxx11-cu128-x86_64-linux/activation/__init__.py +1 -1
build/torch28-cxx11-cu128-x86_64-linux/activation/_activation_e5e2eeb_dirty.abi3.so +3 -0
build/torch28-cxx11-cu128-x86_64-linux/activation/_ops.py +3 -3
build/torch28-cxx11-cu128-x86_64-linux/activation/layers.py +1 -1
build/torch28-cxx11-cu128-x86_64-linux/activation/rms_norm.py +2 -8
build/torch28-cxx11-cu129-x86_64-linux/activation/__init__.py +1 -1
build/torch28-cxx11-cu129-x86_64-linux/activation/_activation_e5e2eeb_dirty.abi3.so +3 -0
build/torch28-cxx11-cu129-x86_64-linux/activation/_ops.py +3 -3
build/torch28-cxx11-cu129-x86_64-linux/activation/layers.py +1 -1

README.md CHANGED Viewed

@@ -18,13 +18,15 @@ Activation is a python package that contains custom CUDA-based activation kernel
       ```python
       y = x + residual
-      out = rms_norm(y, weight, eps)
       ```
     - Fused as:
       ```python
-      out = fused_add_rms_norm(x, residual, weight, eps)
       ```
   - **FusedMulPolyNorm**

       ```python
       y = x + residual
+      hidden_state = rms_norm(y, weight, eps)
+      out = y + some_op(hidden_state)
       ```
     - Fused as:
       ```python
+      hidden_state, y = fused_add_rms_norm(x, residual, weight, eps)
+      out = y + some_op(hidden_state)
       ```
   - **FusedMulPolyNorm**

activation/fused_add_rms_norm.cu CHANGED Viewed

@@ -117,9 +117,175 @@ fused_add_rms_norm_kernel(scalar_t *__restrict__ out,            // [..., d]
   }
 }
 } // namespace motif
-#define LAUNCH_RMS_NORM(width)                                                 \
   MOTIF_DISPATCH_FLOATING_TYPES(                                               \
       input.scalar_type(), "fused_add_rms_norm_kernel", [&] {                  \
         motif::fused_add_rms_norm_kernel<scalar_t, float, width>               \
@@ -150,8 +316,60 @@ void fused_add_rms_norm(torch::Tensor &out,            // [..., d]
   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   if (d % 8 == 0) {
-    LAUNCH_RMS_NORM(8);
   } else {
-    LAUNCH_RMS_NORM(0);
   }
 }

   }
 }
+template <typename scalar_t, typename acc_t, int width>
+__global__ std::enable_if_t<(width > 0)> fused_add_rms_norm_backward_kernel(
+    scalar_t *__restrict__ input_grad,            // [..., d]
+    acc_t *__restrict__ temp_weight_grad,         // [..., d]
+    const scalar_t *__restrict__ output_grad,     // [..., d]
+    const scalar_t *__restrict__ add_output_grad, // [..., d]
+    const scalar_t *__restrict__ input,           // [..., d]
+    const scalar_t *__restrict__ weight,          // [d]
+    const float eps, const int d) {
+  using vec_t = type_vec_t<scalar_t, width>;
+  using dw_vec_t = type_vec_t<acc_t, width>;
+  const int64_t token_idx = blockIdx.x;
+  const int64_t vec_idx = threadIdx.x;
+  const int vec_d = d / width;
+  const int64_t vec_offset = token_idx * vec_d;
+  const vec_t *__restrict__ input_vec = reinterpret_cast<const vec_t *>(input);
+  const vec_t *__restrict__ output_grad_vec =
+      reinterpret_cast<const vec_t *>(output_grad);
+  const vec_t *__restrict__ weight_vec =
+      reinterpret_cast<const vec_t *>(weight);
+  acc_t d_sum = 0.0f;
+  acc_t sum_square = 0.0f;
+  for (int64_t vidx = vec_idx; vidx < vec_d; vidx += blockDim.x) {
+    vec_t x_vec = input_vec[vec_offset + vidx];
+    vec_t dy_vec = output_grad_vec[vec_offset + vidx];
+    vec_t w_vec = weight_vec[vidx];
+#pragma unroll
+    for (int i = 0; i < width; ++i) {
+      acc_t x = x_vec.data[i];
+      acc_t dy = dy_vec.data[i];
+      acc_t w = w_vec.data[i];
+      d_sum += dy * x * w;
+      sum_square += x * x;
+    }
+  }
+  using BlockReduce = cub::BlockReduce<float2, 1024>;
+  __shared__ typename BlockReduce::TempStorage reduceStore;
+  struct SumOp {
+    __device__ float2 operator()(const float2 &a, const float2 &b) const {
+      return make_float2(a.x + b.x, a.y + b.y);
+    }
+  };
+  float2 thread_sums = make_float2(d_sum, sum_square);
+  float2 block_sums =
+      BlockReduce(reduceStore).Reduce(thread_sums, SumOp{}, blockDim.x);
+  d_sum = block_sums.x;
+  sum_square = block_sums.y;
+  __shared__ acc_t s_scale;
+  __shared__ acc_t s_dxx;
+  if (threadIdx.x == 0) {
+    acc_t scale = rsqrtf(sum_square / d + eps);
+    s_dxx = d_sum * scale * scale * scale / d;
+    s_scale = scale;
+  }
+  __syncthreads();
+  acc_t scale = s_scale;
+  acc_t dxx = s_dxx;
+  vec_t *__restrict__ input_grad_vec = reinterpret_cast<vec_t *>(input_grad);
+  dw_vec_t *__restrict__ temp_weight_grad_vec =
+      reinterpret_cast<dw_vec_t *>(temp_weight_grad);
+  const vec_t *__restrict__ add_output_grad_vec =
+      reinterpret_cast<const vec_t *>(add_output_grad);
+  for (int64_t vidx = vec_idx; vidx < vec_d; vidx += blockDim.x) {
+    vec_t x_vec = input_vec[vec_offset + vidx];
+    vec_t dy_vec = output_grad_vec[vec_offset + vidx];
+    vec_t da_vec = add_output_grad_vec[vec_offset + vidx];
+    vec_t w_vec = weight_vec[vidx];
+    vec_t in_grad_vec;
+    dw_vec_t tw_grad_vec;
+#pragma unroll
+    for (int i = 0; i < width; ++i) {
+      acc_t x = x_vec.data[i];
+      acc_t dy = dy_vec.data[i];
+      acc_t w = w_vec.data[i];
+      if (input_grad) {
+        scalar_t da = da_vec.data[i];
+        scalar_t in_grad = scale * dy * w - dxx * x;
+        in_grad_vec.data[i] = in_grad + da;
+      }
+      tw_grad_vec.data[i] = dy * x * scale;
+    }
+    if (input_grad) {
+      input_grad_vec[vec_offset + vidx] = in_grad_vec;
+    }
+    temp_weight_grad_vec[vec_offset + vidx] = tw_grad_vec;
+  }
+}
+template <typename scalar_t, typename acc_t, int width>
+__global__ std::enable_if_t<(width == 0)> fused_add_rms_norm_backward_kernel(
+    scalar_t *__restrict__ input_grad,            // [..., d]
+    acc_t *__restrict__ temp_weight_grad,         // [..., d]
+    const scalar_t *__restrict__ output_grad,     // [..., d]
+    const scalar_t *__restrict__ add_output_grad, // [..., d]
+    const scalar_t *__restrict__ input,           // [..., d]
+    const scalar_t *__restrict__ weight,          // [d]
+    const float eps, const int d) {
+  const int64_t token_idx = blockIdx.x;
+  const int64_t vec_idx = threadIdx.x;
+  acc_t d_sum = 0.0f;
+  acc_t sum_square = 0.0f;
+  for (int64_t idx = vec_idx; idx < d; idx += blockDim.x) {
+    acc_t x = input[token_idx * d + idx];
+    acc_t dy = output_grad[token_idx * d + idx];
+    acc_t w = weight[idx];
+    d_sum += dy * x * w;
+    sum_square += x * x;
+  }
+  using BlockReduce = cub::BlockReduce<float2, 1024>;
+  __shared__ typename BlockReduce::TempStorage reduceStore;
+  struct SumOp {
+    __device__ float2 operator()(const float2 &a, const float2 &b) const {
+      return make_float2(a.x + b.x, a.y + b.y);
+    }
+  };
+  float2 thread_sums = make_float2(d_sum, sum_square);
+  float2 block_sums =
+      BlockReduce(reduceStore).Reduce(thread_sums, SumOp{}, blockDim.x);
+  d_sum = block_sums.x;
+  sum_square = block_sums.y;
+  __shared__ acc_t s_scale;
+  __shared__ acc_t s_dxx;
+  if (threadIdx.x == 0) {
+    acc_t scale = rsqrtf(sum_square / d + eps);
+    s_dxx = d_sum * scale * scale * scale / d;
+    s_scale = scale;
+  }
+  __syncthreads();
+  acc_t scale = s_scale;
+  acc_t dxx = s_dxx;
+  for (int64_t idx = vec_idx; idx < d; idx += blockDim.x) {
+    acc_t x = input[token_idx * d + idx];
+    acc_t dy = output_grad[token_idx * d + idx];
+    acc_t w = weight[idx];
+    if (input_grad) {
+      scalar_t da = add_output_grad[token_idx * d + idx];
+      scalar_t in_grad = scale * dy * w - dxx * x;
+      input_grad[token_idx * d + idx] = in_grad + da;
+    }
+    temp_weight_grad[token_idx * d + idx] = dy * x * scale;
+  }
+}
 } // namespace motif
+#define LAUNCH_FUSED_ADD_RMS_NORM(width)                                       \
   MOTIF_DISPATCH_FLOATING_TYPES(                                               \
       input.scalar_type(), "fused_add_rms_norm_kernel", [&] {                  \
         motif::fused_add_rms_norm_kernel<scalar_t, float, width>               \
   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   if (d % 8 == 0) {
+    LAUNCH_FUSED_ADD_RMS_NORM(8);
+  } else {
+    LAUNCH_FUSED_ADD_RMS_NORM(0);
+  }
+}
+#define LAUNCH_FUSED_ADD_RMS_NORM_BWD(width)                                   \
+  MOTIF_DISPATCH_FLOATING_TYPES(                                               \
+      input.scalar_type(), "fused_add_rms_norm_backward_kernel", [&] {         \
+        motif::fused_add_rms_norm_backward_kernel<scalar_t, float, width>      \
+            <<<grid, block, 0, stream>>>(input_grad.data_ptr<scalar_t>(),      \
+                                         temp_weight_grad.data_ptr<float>(),   \
+                                         output_grad.data_ptr<scalar_t>(),     \
+                                         add_output_grad.data_ptr<scalar_t>(), \
+                                         input.data_ptr<scalar_t>(),           \
+                                         weight.data_ptr<scalar_t>(), eps, d); \
+      });
+// Host entry point for the backward pass of fused add + RMS norm.
+//
+// Launches motif::fused_add_rms_norm_backward_kernel with one block per
+// token, where a token is one row of length d = input.size(-1). The kernel
+// writes input_grad and a float scratch buffer temp_weight_grad of shape
+// [num_tokens, d]. When weight_grad is defined, that scratch buffer is summed
+// over the token dimension and copied into weight_grad.
+//
+// Parameters:
+//   input_grad      [..., d] output: norm gradient plus add_output_grad.
+//   weight_grad     [d] output; left untouched when undefined.
+//   output_grad     [..., d] gradient of the normalized output.
+//   add_output_grad [..., d] gradient added element-wise into input_grad.
+//   input           [..., d] tensor the kernel normalizes.
+//   weight          [d] scale applied by the norm.
+//   eps             added to the mean square before the reciprocal sqrt.
+void fused_add_rms_norm_backward(
+    torch::Tensor &input_grad,            // [..., d]
+    torch::Tensor &weight_grad,           // [d]
+    const torch::Tensor &output_grad,     // [..., d]
+    const torch::Tensor &add_output_grad, // [..., d]
+    const torch::Tensor &input,           // [..., d]
+    const torch::Tensor &weight,          // [d]
+    double eps) {
+  AssertTensorShapeEqual(input, input_grad, "input", "input_grad");
+  AssertTensorShapeEqual(input, output_grad, "input", "output_grad");
+  // Validate add_output_grad itself: the kernel indexes it exactly like
+  // input, so a mismatched shape would read out of bounds.
+  AssertTensorShapeEqual(input, add_output_grad, "input", "add_output_grad");
+  AssertTensorNotNull(weight, "weight");
+  // TODO shape check
+  // weight_grad, input_grad can be nullable
+  int d = input.size(-1);
+  int64_t num_tokens = input.numel() / input.size(-1);
+  dim3 grid(num_tokens);
+  // Use up to 1024 threads per block when there are fewer than 256 tokens,
+  // otherwise cap at 256; never launch more threads than d.
+  const int max_block_size = (num_tokens < 256) ? 1024 : 256;
+  dim3 block(std::min(d, max_block_size));
+  // Per-token weight-gradient contributions, kept in float and reduced below.
+  torch::Tensor temp_weight_grad =
+      torch::empty({num_tokens, d}, input.options().dtype(torch::kFloat));
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  // width 8 selects the vectorized kernel; width 0 the scalar fallback.
+  if (d % 8 == 0) {
+    LAUNCH_FUSED_ADD_RMS_NORM_BWD(8);
   } else {
+    LAUNCH_FUSED_ADD_RMS_NORM_BWD(0);
+  }
+  if (weight_grad.defined()) {
+    // Reduce over tokens in float, then copy into weight_grad.
+    torch::Tensor acc =
+        torch::empty_like(weight_grad, temp_weight_grad.options());
+    at::sum_out(acc, temp_weight_grad, {0});
+    weight_grad.copy_(acc);
   }
 }

activation/fused_mul_poly_norm.cu CHANGED Viewed

@@ -573,7 +573,7 @@ void fused_mul_poly_norm(torch::Tensor &out,          // [..., d]
   }
 }
-#define LAUNCH_POLY_NORM_BACKWARD(width)                                       \
   MOTIF_DISPATCH_FLOATING_TYPES(                                               \
       input.scalar_type(), "fused_mul_poly_norm_backward_kernel", [&] {        \
         motif::fused_mul_poly_norm_backward_kernel<scalar_t, float, width>     \
@@ -620,11 +620,11 @@ void fused_mul_poly_norm_backward(torch::Tensor &input_grad,        // [..., d]
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   if (d % 8 == 0 && input.element_size() == 2) {
-    LAUNCH_POLY_NORM_BACKWARD(8);
   } else if (d % 4 == 0 && input.element_size() == 4) {
-    LAUNCH_POLY_NORM_BACKWARD(4);
   } else {
-    LAUNCH_POLY_NORM_BACKWARD(0);
   }
   if (bias_grad.defined()) {

   }
 }
+#define LAUNCH_FUSED_MUL_POLY_NORM_BACKWARD(width)                             \
   MOTIF_DISPATCH_FLOATING_TYPES(                                               \
       input.scalar_type(), "fused_mul_poly_norm_backward_kernel", [&] {        \
         motif::fused_mul_poly_norm_backward_kernel<scalar_t, float, width>     \
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   if (d % 8 == 0 && input.element_size() == 2) {
+    LAUNCH_FUSED_MUL_POLY_NORM_BACKWARD(8);
   } else if (d % 4 == 0 && input.element_size() == 4) {
+    LAUNCH_FUSED_MUL_POLY_NORM_BACKWARD(4);
   } else {
+    LAUNCH_FUSED_MUL_POLY_NORM_BACKWARD(0);
   }
   if (bias_grad.defined()) {

benchmarks/cases/add_rms.py CHANGED Viewed

@@ -12,7 +12,8 @@ class FusedAddRMSNorm(torch.nn.Module):
         self.eps = eps
     def forward(self, x, residual):
-        return activation.rms_norm((x + residual), self.weight, self.eps)
 class AddRMS(DiffCase):

         self.eps = eps
     def forward(self, x, residual):
+        h = x + residual
+        return activation.rms_norm(h, self.weight, self.eps), h
 class AddRMS(DiffCase):

benchmarks/common/bench_framework.py CHANGED Viewed

@@ -149,7 +149,10 @@ def make_bwd_benchmark_for_case(
         obj = case.make_naive(I) if provider == "naive" else case.make_cuda(I)
         y = case.forward(obj, I)
         gin = list(case.grad_inputs(I)) + list(obj.parameters())
-        g = torch.randn_like(y)
         run = lambda: torch.autograd.grad(y,
                                           gin,
                                           g,
@@ -201,7 +204,10 @@ def make_bwd_benchmark_plot_for_case(
         obj = case.make_naive(I) if provider == "naive" else case.make_cuda(I)
         y = case.forward(obj, I)
         gin = list(case.grad_inputs(I)) + list(obj.parameters())
-        g = torch.randn_like(y)
         run = lambda: torch.autograd.grad(y,
                                           gin,
                                           g,

         obj = case.make_naive(I) if provider == "naive" else case.make_cuda(I)
         y = case.forward(obj, I)
         gin = list(case.grad_inputs(I)) + list(obj.parameters())
+        if isinstance(y, torch.Tensor):
+            g = [torch.randn_like(y)]
+        else:
+            g = [torch.randn_like(r) for r in y]
         run = lambda: torch.autograd.grad(y,
                                           gin,
                                           g,
         obj = case.make_naive(I) if provider == "naive" else case.make_cuda(I)
         y = case.forward(obj, I)
         gin = list(case.grad_inputs(I)) + list(obj.parameters())
+        if isinstance(y, torch.Tensor):
+            g = [torch.randn_like(y)]
+        else:
+            g = [torch.randn_like(r) for r in y]
         run = lambda: torch.autograd.grad(y,
                                           gin,
                                           g,

benchmarks/common/diff_engine.py CHANGED Viewed

@@ -68,7 +68,10 @@ def calculate_diff(
     torch.testing.assert_close(y_n, y_c, atol=atol, rtol=rtol)
     gin_n = list(case.grad_inputs(I_n)) + list(obj_n.parameters())
     gin_c = list(case.grad_inputs(I_c)) + list(obj_c.parameters())
-    g = _unit_grad_like(y_n).to(device)
     ng = torch.autograd.grad(y_n,
                              gin_n,
                              g,

     torch.testing.assert_close(y_n, y_c, atol=atol, rtol=rtol)
     gin_n = list(case.grad_inputs(I_n)) + list(obj_n.parameters())
     gin_c = list(case.grad_inputs(I_c)) + list(obj_c.parameters())
+    if isinstance(y_n, torch.Tensor):
+        g = [_unit_grad_like(y_n).to(device)]
+    else:
+        g = [_unit_grad_like(r).to(device) for r in y_n]
     ng = torch.autograd.grad(y_n,
                              gin_n,
                              g,

benchmarks/plots/h100/add_rms/plot_add_rms-bwd-perf.png CHANGED Viewed

benchmarks/plots/h100/add_rms/plot_add_rms-fwd-perf.png CHANGED Viewed

benchmarks/plots/h100/mul_poly/plot_mul_poly-bwd-perf.png CHANGED Viewed

benchmarks/plots/h100/mul_poly/plot_mul_poly-fwd-perf.png CHANGED Viewed

benchmarks/plots/h100/poly/plot_poly-bwd-perf.png CHANGED Viewed

benchmarks/plots/h100/poly/plot_poly-fwd-perf.png CHANGED Viewed

benchmarks/plots/h100/rms/plot_rms-bwd-perf.png CHANGED Viewed

benchmarks/plots/h100/rms/plot_rms-fwd-perf.png CHANGED Viewed

benchmarks/plots/mi250/add_rms/plot_add_rms-bwd-perf.png CHANGED Viewed

benchmarks/plots/mi250/add_rms/plot_add_rms-fwd-perf.png CHANGED Viewed

build/torch27-cxx11-cu118-x86_64-linux/activation/__init__.py CHANGED Viewed

@@ -39,7 +39,7 @@ def fused_add_rms_norm(
     weight: torch.Tensor,
     eps: float = 1e-6,
 ) -> None:
-    return FusedAddRMSNormFunction.apply(x, residual, weight, eps)[0]
 __all__ = [

     weight: torch.Tensor,
     eps: float = 1e-6,
 ) -> None:
+    return FusedAddRMSNormFunction.apply(x, residual, weight, eps)
 __all__ = [

build/torch27-cxx11-cu118-x86_64-linux/activation/_activation_e5e2eeb_dirty.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ec9ea7edc8b27f7983e20d615ab470cef6b82975afc214becfddfd05a867a839
+size 8600336

build/torch27-cxx11-cu118-x86_64-linux/activation/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _activation_20250907180255
-ops = torch.ops._activation_20250907180255
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_activation_20250907180255::{op_name}"

 import torch
+from . import _activation_e5e2eeb_dirty
+ops = torch.ops._activation_e5e2eeb_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_activation_e5e2eeb_dirty::{op_name}"

build/torch27-cxx11-cu118-x86_64-linux/activation/layers.py CHANGED Viewed

@@ -85,7 +85,7 @@ class FusedAddRMSNorm(nn.Module):
         residual: torch.Tensor,
     ):
         return FusedAddRMSNormFunction.apply(x, residual, self.weight,
-                                             self.eps)[0]
     def reset_parameters(self) -> None:
         """

         residual: torch.Tensor,
     ):
         return FusedAddRMSNormFunction.apply(x, residual, self.weight,
+                                             self.eps)
     def reset_parameters(self) -> None:
         """

build/torch27-cxx11-cu118-x86_64-linux/activation/rms_norm.py CHANGED Viewed

@@ -54,20 +54,14 @@ class FusedAddRMSNormFunction(torch.autograd.Function):
     def setup_context(ctx, inputs, outputs):
         _, _, weight, eps = inputs
         _, add_output = outputs
-        ctx.mark_non_differentiable(add_output)
-        ctx.set_materialize_grads(False)
         ctx.save_for_backward(weight, add_output)
         ctx.eps = eps
-    # This function only needs one gradient
     @staticmethod
-    def backward(ctx, output_grad, _):
         weight, add_output = ctx.saved_tensors
         eps = ctx.eps
-        if output_grad is None:
-            output_grad = torch.zeros_like(add_output)
         need_in = ctx.needs_input_grad[0]
         need_res = ctx.needs_input_grad[1]
@@ -76,7 +70,7 @@ class FusedAddRMSNormFunction(torch.autograd.Function):
         weight_grad = torch.empty_like(
             weight) if ctx.needs_input_grad[2] else None
-        ops.rms_norm_backward(grad, weight_grad, output_grad, add_output,
                               weight, eps)
         input_grad = grad if need_in else None
         residual_grad = grad if need_res else None

     def setup_context(ctx, inputs, outputs):
         _, _, weight, eps = inputs
         _, add_output = outputs
         ctx.save_for_backward(weight, add_output)
         ctx.eps = eps
     @staticmethod
+    def backward(ctx, output_grad, add_output_grad):
         weight, add_output = ctx.saved_tensors
         eps = ctx.eps
         need_in = ctx.needs_input_grad[0]
         need_res = ctx.needs_input_grad[1]
         weight_grad = torch.empty_like(
             weight) if ctx.needs_input_grad[2] else None
+        ops.fused_add_rms_norm_backward(grad, weight_grad, output_grad, add_output_grad, add_output,
                               weight, eps)
         input_grad = grad if need_in else None
         residual_grad = grad if need_res else None

build/torch27-cxx11-cu126-x86_64-linux/activation/__init__.py CHANGED Viewed

@@ -39,7 +39,7 @@ def fused_add_rms_norm(
     weight: torch.Tensor,
     eps: float = 1e-6,
 ) -> None:
-    return FusedAddRMSNormFunction.apply(x, residual, weight, eps)[0]
 __all__ = [

     weight: torch.Tensor,
     eps: float = 1e-6,
 ) -> None:
+    return FusedAddRMSNormFunction.apply(x, residual, weight, eps)
 __all__ = [

build/torch27-cxx11-cu126-x86_64-linux/activation/_activation_e5e2eeb_dirty.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5d3511410cdc288d2fafc500223ed2e625e360f50fa341809cf892fb2c822924
+size 8779000

build/torch27-cxx11-cu126-x86_64-linux/activation/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _activation_20250907180255
-ops = torch.ops._activation_20250907180255
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_activation_20250907180255::{op_name}"

 import torch
+from . import _activation_e5e2eeb_dirty
+ops = torch.ops._activation_e5e2eeb_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_activation_e5e2eeb_dirty::{op_name}"

build/torch27-cxx11-cu126-x86_64-linux/activation/layers.py CHANGED Viewed

@@ -85,7 +85,7 @@ class FusedAddRMSNorm(nn.Module):
         residual: torch.Tensor,
     ):
         return FusedAddRMSNormFunction.apply(x, residual, self.weight,
-                                             self.eps)[0]
     def reset_parameters(self) -> None:
         """

         residual: torch.Tensor,
     ):
         return FusedAddRMSNormFunction.apply(x, residual, self.weight,
+                                             self.eps)
     def reset_parameters(self) -> None:
         """

build/torch27-cxx11-cu126-x86_64-linux/activation/rms_norm.py CHANGED Viewed

@@ -54,20 +54,14 @@ class FusedAddRMSNormFunction(torch.autograd.Function):
     def setup_context(ctx, inputs, outputs):
         _, _, weight, eps = inputs
         _, add_output = outputs
-        ctx.mark_non_differentiable(add_output)
-        ctx.set_materialize_grads(False)
         ctx.save_for_backward(weight, add_output)
         ctx.eps = eps
-    # This function only needs one gradient
     @staticmethod
-    def backward(ctx, output_grad, _):
         weight, add_output = ctx.saved_tensors
         eps = ctx.eps
-        if output_grad is None:
-            output_grad = torch.zeros_like(add_output)
         need_in = ctx.needs_input_grad[0]
         need_res = ctx.needs_input_grad[1]
@@ -76,7 +70,7 @@ class FusedAddRMSNormFunction(torch.autograd.Function):
         weight_grad = torch.empty_like(
             weight) if ctx.needs_input_grad[2] else None
-        ops.rms_norm_backward(grad, weight_grad, output_grad, add_output,
                               weight, eps)
         input_grad = grad if need_in else None
         residual_grad = grad if need_res else None

     def setup_context(ctx, inputs, outputs):
         _, _, weight, eps = inputs
         _, add_output = outputs
         ctx.save_for_backward(weight, add_output)
         ctx.eps = eps
     @staticmethod
+    def backward(ctx, output_grad, add_output_grad):
         weight, add_output = ctx.saved_tensors
         eps = ctx.eps
         need_in = ctx.needs_input_grad[0]
         need_res = ctx.needs_input_grad[1]
         weight_grad = torch.empty_like(
             weight) if ctx.needs_input_grad[2] else None
+        ops.fused_add_rms_norm_backward(grad, weight_grad, output_grad, add_output_grad, add_output,
                               weight, eps)
         input_grad = grad if need_in else None
         residual_grad = grad if need_res else None

build/torch27-cxx11-cu128-x86_64-linux/activation/__init__.py CHANGED Viewed

@@ -39,7 +39,7 @@ def fused_add_rms_norm(
     weight: torch.Tensor,
     eps: float = 1e-6,
 ) -> None:
-    return FusedAddRMSNormFunction.apply(x, residual, weight, eps)[0]
 __all__ = [

     weight: torch.Tensor,
     eps: float = 1e-6,
 ) -> None:
+    return FusedAddRMSNormFunction.apply(x, residual, weight, eps)
 __all__ = [

build/torch27-cxx11-cu128-x86_64-linux/activation/_activation_e5e2eeb_dirty.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:25efc9c32e4bd6609a8326025aad861cbf79b544893755fe44519c9df7224c40
+size 13818872

build/torch27-cxx11-cu128-x86_64-linux/activation/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _activation_20250907180255
-ops = torch.ops._activation_20250907180255
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_activation_20250907180255::{op_name}"

 import torch
+from . import _activation_e5e2eeb_dirty
+ops = torch.ops._activation_e5e2eeb_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_activation_e5e2eeb_dirty::{op_name}"

build/torch27-cxx11-cu128-x86_64-linux/activation/layers.py CHANGED Viewed

@@ -85,7 +85,7 @@ class FusedAddRMSNorm(nn.Module):
         residual: torch.Tensor,
     ):
         return FusedAddRMSNormFunction.apply(x, residual, self.weight,
-                                             self.eps)[0]
     def reset_parameters(self) -> None:
         """

         residual: torch.Tensor,
     ):
         return FusedAddRMSNormFunction.apply(x, residual, self.weight,
+                                             self.eps)
     def reset_parameters(self) -> None:
         """

build/torch27-cxx11-cu128-x86_64-linux/activation/rms_norm.py CHANGED Viewed

@@ -54,20 +54,14 @@ class FusedAddRMSNormFunction(torch.autograd.Function):
     def setup_context(ctx, inputs, outputs):
         _, _, weight, eps = inputs
         _, add_output = outputs
-        ctx.mark_non_differentiable(add_output)
-        ctx.set_materialize_grads(False)
         ctx.save_for_backward(weight, add_output)
         ctx.eps = eps
-    # This function only needs one gradient
     @staticmethod
-    def backward(ctx, output_grad, _):
         weight, add_output = ctx.saved_tensors
         eps = ctx.eps
-        if output_grad is None:
-            output_grad = torch.zeros_like(add_output)
         need_in = ctx.needs_input_grad[0]
         need_res = ctx.needs_input_grad[1]
@@ -76,7 +70,7 @@ class FusedAddRMSNormFunction(torch.autograd.Function):
         weight_grad = torch.empty_like(
             weight) if ctx.needs_input_grad[2] else None
-        ops.rms_norm_backward(grad, weight_grad, output_grad, add_output,
                               weight, eps)
         input_grad = grad if need_in else None
         residual_grad = grad if need_res else None

     def setup_context(ctx, inputs, outputs):
         _, _, weight, eps = inputs
         _, add_output = outputs
         ctx.save_for_backward(weight, add_output)
         ctx.eps = eps
     @staticmethod
+    def backward(ctx, output_grad, add_output_grad):
         weight, add_output = ctx.saved_tensors
         eps = ctx.eps
         need_in = ctx.needs_input_grad[0]
         need_res = ctx.needs_input_grad[1]
         weight_grad = torch.empty_like(
             weight) if ctx.needs_input_grad[2] else None
+        ops.fused_add_rms_norm_backward(grad, weight_grad, output_grad, add_output_grad, add_output,
                               weight, eps)
         input_grad = grad if need_in else None
         residual_grad = grad if need_res else None

build/torch27-cxx11-rocm63-x86_64-linux/activation/__init__.py CHANGED Viewed

@@ -39,7 +39,7 @@ def fused_add_rms_norm(
     weight: torch.Tensor,
     eps: float = 1e-6,
 ) -> None:
-    return FusedAddRMSNormFunction.apply(x, residual, weight, eps)[0]
 __all__ = [

     weight: torch.Tensor,
     eps: float = 1e-6,
 ) -> None:
+    return FusedAddRMSNormFunction.apply(x, residual, weight, eps)
 __all__ = [

build/torch27-cxx11-rocm63-x86_64-linux/activation/_activation_e5e2eeb_dirty.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c80d05690547f2842d416ebb85c9f830370373bc7e6c54ba08eec61b3690280f
+size 4386744

build/torch27-cxx11-rocm63-x86_64-linux/activation/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _activation_20250907180255
-ops = torch.ops._activation_20250907180255
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_activation_20250907180255::{op_name}"

 import torch
+from . import _activation_e5e2eeb_dirty
+ops = torch.ops._activation_e5e2eeb_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_activation_e5e2eeb_dirty::{op_name}"

build/torch27-cxx11-rocm63-x86_64-linux/activation/layers.py CHANGED Viewed

@@ -85,7 +85,7 @@ class FusedAddRMSNorm(nn.Module):
         residual: torch.Tensor,
     ):
         return FusedAddRMSNormFunction.apply(x, residual, self.weight,
-                                             self.eps)[0]
     def reset_parameters(self) -> None:
         """

         residual: torch.Tensor,
     ):
         return FusedAddRMSNormFunction.apply(x, residual, self.weight,
+                                             self.eps)
     def reset_parameters(self) -> None:
         """

build/torch27-cxx11-rocm63-x86_64-linux/activation/rms_norm.py CHANGED Viewed

@@ -54,20 +54,14 @@ class FusedAddRMSNormFunction(torch.autograd.Function):
     def setup_context(ctx, inputs, outputs):
         _, _, weight, eps = inputs
         _, add_output = outputs
-        ctx.mark_non_differentiable(add_output)
-        ctx.set_materialize_grads(False)
         ctx.save_for_backward(weight, add_output)
         ctx.eps = eps
-    # This function only needs one gradient
     @staticmethod
-    def backward(ctx, output_grad, _):
         weight, add_output = ctx.saved_tensors
         eps = ctx.eps
-        if output_grad is None:
-            output_grad = torch.zeros_like(add_output)
         need_in = ctx.needs_input_grad[0]
         need_res = ctx.needs_input_grad[1]
@@ -76,7 +70,7 @@ class FusedAddRMSNormFunction(torch.autograd.Function):
         weight_grad = torch.empty_like(
             weight) if ctx.needs_input_grad[2] else None
-        ops.rms_norm_backward(grad, weight_grad, output_grad, add_output,
                               weight, eps)
         input_grad = grad if need_in else None
         residual_grad = grad if need_res else None

     def setup_context(ctx, inputs, outputs):
         _, _, weight, eps = inputs
         _, add_output = outputs
         ctx.save_for_backward(weight, add_output)
         ctx.eps = eps
     @staticmethod
+    def backward(ctx, output_grad, add_output_grad):
         weight, add_output = ctx.saved_tensors
         eps = ctx.eps
         need_in = ctx.needs_input_grad[0]
         need_res = ctx.needs_input_grad[1]
         weight_grad = torch.empty_like(
             weight) if ctx.needs_input_grad[2] else None
+        ops.fused_add_rms_norm_backward(grad, weight_grad, output_grad, add_output_grad, add_output,
                               weight, eps)
         input_grad = grad if need_in else None
         residual_grad = grad if need_res else None

build/torch28-cxx11-cu126-x86_64-linux/activation/__init__.py CHANGED Viewed

@@ -39,7 +39,7 @@ def fused_add_rms_norm(
     weight: torch.Tensor,
     eps: float = 1e-6,
 ) -> None:
-    return FusedAddRMSNormFunction.apply(x, residual, weight, eps)[0]
 __all__ = [

     weight: torch.Tensor,
     eps: float = 1e-6,
 ) -> None:
+    return FusedAddRMSNormFunction.apply(x, residual, weight, eps)
 __all__ = [

build/torch28-cxx11-cu126-x86_64-linux/activation/_activation_e5e2eeb_dirty.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:440f5c17a7ddaf73c506bbc84fd1405e2e188b8ceaf4977910608be6b91e89bf
+size 8730200

build/torch28-cxx11-cu126-x86_64-linux/activation/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _activation_20250907180255
-ops = torch.ops._activation_20250907180255
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_activation_20250907180255::{op_name}"

 import torch
+from . import _activation_e5e2eeb_dirty
+ops = torch.ops._activation_e5e2eeb_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_activation_e5e2eeb_dirty::{op_name}"

build/torch28-cxx11-cu126-x86_64-linux/activation/layers.py CHANGED Viewed

@@ -85,7 +85,7 @@ class FusedAddRMSNorm(nn.Module):
         residual: torch.Tensor,
     ):
         return FusedAddRMSNormFunction.apply(x, residual, self.weight,
-                                             self.eps)[0]
     def reset_parameters(self) -> None:
         """

         residual: torch.Tensor,
     ):
         return FusedAddRMSNormFunction.apply(x, residual, self.weight,
+                                             self.eps)
     def reset_parameters(self) -> None:
         """

build/torch28-cxx11-cu126-x86_64-linux/activation/rms_norm.py CHANGED Viewed

@@ -54,20 +54,14 @@ class FusedAddRMSNormFunction(torch.autograd.Function):
     def setup_context(ctx, inputs, outputs):
         _, _, weight, eps = inputs
         _, add_output = outputs
-        ctx.mark_non_differentiable(add_output)
-        ctx.set_materialize_grads(False)
         ctx.save_for_backward(weight, add_output)
         ctx.eps = eps
-    # This function only needs one gradient
     @staticmethod
-    def backward(ctx, output_grad, _):
         weight, add_output = ctx.saved_tensors
         eps = ctx.eps
-        if output_grad is None:
-            output_grad = torch.zeros_like(add_output)
         need_in = ctx.needs_input_grad[0]
         need_res = ctx.needs_input_grad[1]
@@ -76,7 +70,7 @@ class FusedAddRMSNormFunction(torch.autograd.Function):
         weight_grad = torch.empty_like(
             weight) if ctx.needs_input_grad[2] else None
-        ops.rms_norm_backward(grad, weight_grad, output_grad, add_output,
                               weight, eps)
         input_grad = grad if need_in else None
         residual_grad = grad if need_res else None

     def setup_context(ctx, inputs, outputs):
         _, _, weight, eps = inputs
         _, add_output = outputs
         ctx.save_for_backward(weight, add_output)
         ctx.eps = eps
     @staticmethod
+    def backward(ctx, output_grad, add_output_grad):
         weight, add_output = ctx.saved_tensors
         eps = ctx.eps
         need_in = ctx.needs_input_grad[0]
         need_res = ctx.needs_input_grad[1]
         weight_grad = torch.empty_like(
             weight) if ctx.needs_input_grad[2] else None
+        ops.fused_add_rms_norm_backward(grad, weight_grad, output_grad, add_output_grad, add_output,
                               weight, eps)
         input_grad = grad if need_in else None
         residual_grad = grad if need_res else None

build/torch28-cxx11-cu128-x86_64-linux/activation/__init__.py CHANGED Viewed

@@ -39,7 +39,7 @@ def fused_add_rms_norm(
     weight: torch.Tensor,
     eps: float = 1e-6,
 ) -> None:
-    return FusedAddRMSNormFunction.apply(x, residual, weight, eps)[0]
 __all__ = [

     weight: torch.Tensor,
     eps: float = 1e-6,
 ) -> None:
+    return FusedAddRMSNormFunction.apply(x, residual, weight, eps)
 __all__ = [

build/torch28-cxx11-cu128-x86_64-linux/activation/_activation_e5e2eeb_dirty.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1dfb6d468f9cef0239d4ea47f0a247fa721befc5b8db86e1cddfc25f1814b67a
+size 13770064

build/torch28-cxx11-cu128-x86_64-linux/activation/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _activation_20250907180255
-ops = torch.ops._activation_20250907180255
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_activation_20250907180255::{op_name}"

 import torch
+from . import _activation_e5e2eeb_dirty
+ops = torch.ops._activation_e5e2eeb_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_activation_e5e2eeb_dirty::{op_name}"

build/torch28-cxx11-cu128-x86_64-linux/activation/layers.py CHANGED Viewed

@@ -85,7 +85,7 @@ class FusedAddRMSNorm(nn.Module):
         residual: torch.Tensor,
     ):
         return FusedAddRMSNormFunction.apply(x, residual, self.weight,
-                                             self.eps)[0]
     def reset_parameters(self) -> None:
         """

         residual: torch.Tensor,
     ):
         return FusedAddRMSNormFunction.apply(x, residual, self.weight,
+                                             self.eps)
     def reset_parameters(self) -> None:
         """

build/torch28-cxx11-cu128-x86_64-linux/activation/rms_norm.py CHANGED Viewed

@@ -54,20 +54,14 @@ class FusedAddRMSNormFunction(torch.autograd.Function):
     def setup_context(ctx, inputs, outputs):
         _, _, weight, eps = inputs
         _, add_output = outputs
-        ctx.mark_non_differentiable(add_output)
-        ctx.set_materialize_grads(False)
         ctx.save_for_backward(weight, add_output)
         ctx.eps = eps
-    # This function only needs one gradient
     @staticmethod
-    def backward(ctx, output_grad, _):
         weight, add_output = ctx.saved_tensors
         eps = ctx.eps
-        if output_grad is None:
-            output_grad = torch.zeros_like(add_output)
         need_in = ctx.needs_input_grad[0]
         need_res = ctx.needs_input_grad[1]
@@ -76,7 +70,7 @@ class FusedAddRMSNormFunction(torch.autograd.Function):
         weight_grad = torch.empty_like(
             weight) if ctx.needs_input_grad[2] else None
-        ops.rms_norm_backward(grad, weight_grad, output_grad, add_output,
                               weight, eps)
         input_grad = grad if need_in else None
         residual_grad = grad if need_res else None

     def setup_context(ctx, inputs, outputs):
         _, _, weight, eps = inputs
         _, add_output = outputs
         ctx.save_for_backward(weight, add_output)
         ctx.eps = eps
     @staticmethod
+    def backward(ctx, output_grad, add_output_grad):
         weight, add_output = ctx.saved_tensors
         eps = ctx.eps
         need_in = ctx.needs_input_grad[0]
         need_res = ctx.needs_input_grad[1]
         weight_grad = torch.empty_like(
             weight) if ctx.needs_input_grad[2] else None
+        ops.fused_add_rms_norm_backward(grad, weight_grad, output_grad, add_output_grad, add_output,
                               weight, eps)
         input_grad = grad if need_in else None
         residual_grad = grad if need_res else None

build/torch28-cxx11-cu129-x86_64-linux/activation/__init__.py CHANGED Viewed

@@ -39,7 +39,7 @@ def fused_add_rms_norm(
     weight: torch.Tensor,
     eps: float = 1e-6,
 ) -> None:
-    return FusedAddRMSNormFunction.apply(x, residual, weight, eps)[0]
 __all__ = [

     weight: torch.Tensor,
     eps: float = 1e-6,
 ) -> None:
+    return FusedAddRMSNormFunction.apply(x, residual, weight, eps)
 __all__ = [

build/torch28-cxx11-cu129-x86_64-linux/activation/_activation_e5e2eeb_dirty.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0815a50e61497b357b2b90fc28602b3f53a25da1161edd2cb0b0fbebc7c62bf6
+size 13757248

build/torch28-cxx11-cu129-x86_64-linux/activation/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _activation_20250907180255
-ops = torch.ops._activation_20250907180255
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_activation_20250907180255::{op_name}"

 import torch
+from . import _activation_e5e2eeb_dirty
+ops = torch.ops._activation_e5e2eeb_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_activation_e5e2eeb_dirty::{op_name}"

build/torch28-cxx11-cu129-x86_64-linux/activation/layers.py CHANGED Viewed

@@ -85,7 +85,7 @@ class FusedAddRMSNorm(nn.Module):
         residual: torch.Tensor,
     ):
         return FusedAddRMSNormFunction.apply(x, residual, self.weight,
-                                             self.eps)[0]
     def reset_parameters(self) -> None:
         """

         residual: torch.Tensor,
     ):
         return FusedAddRMSNormFunction.apply(x, residual, self.weight,
+                                             self.eps)
     def reset_parameters(self) -> None:
         """