feat(poly-norm): add perf test

Browse files

Files changed (10) hide show

.gitattributes +1 -0
README.md +14 -0
build/torch26-cxx11-rocm62-x86_64-linux/activation/{_activation_883cc1c_dirty.abi3.so → _activation_704692b_dirty.abi3.so} +1 -1
build/torch26-cxx11-rocm62-x86_64-linux/activation/_ops.py +3 -3
build/torch27-cxx11-rocm63-x86_64-linux/activation/{_activation_883cc1c_dirty.abi3.so → _activation_704692b_dirty.abi3.so} +1 -1
build/torch27-cxx11-rocm63-x86_64-linux/activation/_ops.py +3 -3
tests/conftest.py +144 -0
tests/kernels/test_perf.py +120 -0
tests/perf.png +3 -0
tests/perf_result.html +0 -0

.gitattributes CHANGED Viewed

@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 *.so filter=lfs diff=lfs merge=lfs -text

 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 *.so filter=lfs diff=lfs merge=lfs -text
+tests/perf.png filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -24,3 +24,17 @@ x = torch.randn(10, 10)
 print(poly_norm(x))
 ```

 print(poly_norm(x))
 ```
+## Performance
+### PolyNorm
+- Test cases are from the Motif LLM
+- You can reproduce the results with:
+```bash
+cd tests
+pytest --run-perf --do-plot
+```
+![PolyNorm Performance](./tests/perf.png)

build/torch26-cxx11-rocm62-x86_64-linux/activation/{_activation_883cc1c_dirty.abi3.so → _activation_704692b_dirty.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a9d74188efdcb10158b338cf363749494f86e9712797722310f0a6ac5310efdd
 size 2401160

 version https://git-lfs.github.com/spec/v1
+oid sha256:417cf142fb8234b05f7e5b0be321d3a95ceafd7c0b3e5d3469579a52d78ddb1e
 size 2401160

build/torch26-cxx11-rocm62-x86_64-linux/activation/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _activation_883cc1c_dirty
-ops = torch.ops._activation_883cc1c_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_activation_883cc1c_dirty::{op_name}"

 import torch
+from . import _activation_704692b_dirty
+ops = torch.ops._activation_704692b_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_activation_704692b_dirty::{op_name}"

build/torch27-cxx11-rocm63-x86_64-linux/activation/{_activation_883cc1c_dirty.abi3.so → _activation_704692b_dirty.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:719fc6521c0824b253cb11ea9e564ef7835e2102e5bc6399cfdb69203d6d5c26
 size 2395176

 version https://git-lfs.github.com/spec/v1
+oid sha256:6fe6163d88e95c0d6847b3fe993cd80de677f89cfde7fc4d5c3ec2d0d96c9de8
 size 2395176

build/torch27-cxx11-rocm63-x86_64-linux/activation/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _activation_883cc1c_dirty
-ops = torch.ops._activation_883cc1c_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_activation_883cc1c_dirty::{op_name}"

 import torch
+from . import _activation_704692b_dirty
+ops = torch.ops._activation_704692b_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_activation_704692b_dirty::{op_name}"

tests/conftest.py ADDED Viewed

	@@ -0,0 +1,144 @@

+import logging
+import numpy as np
+import plotly.graph_objects as go
+import pytest
+from .kernels.test_perf import PERF_RESULTS, PerfResult
+# Module-level logger used by the plotting helper and the session-finish hook.
+logger = logging.getLogger(__name__)
+# Whether --do-plot was requested; assigned in pytest_configure and read in
+# pytest_sessionfinish.
+DO_PLOT = False
+def plot(perf_results: list[PerfResult]):
+    x_labels = [f"{r.type}, {r.shape}, {r.dtype}" for r in perf_results]
+    kernel_speedup = [r.speedup for r in perf_results]
+    torch_speedup = [1 for _ in perf_results]
+    geo_mean = float(np.exp(np.mean(np.log(kernel_speedup))))
+    x_labels.append("Geometric Mean")
+    kernel_speedup.append(geo_mean)
+    torch_speedup.append(1.0)
+    fig = go.Figure()
+    bar_width = 0.2
+    fig.add_trace(
+        go.Bar(
+            x=x_labels,
+            y=kernel_speedup,
+            name="Activation",
+            marker_color="rgb(100, 100, 100)",
+            text=[f"x{v:.2f}" for v in kernel_speedup],
+            textfont=dict(size=14),
+            textposition="outside",
+            # width=[bar_width] * len(x_labels),
+        )
+    )
+    fig.add_trace(
+        go.Bar(
+            x=x_labels,
+            y=torch_speedup,
+            name="Torch",
+            marker_color="rgb(30, 30, 30)",
+            text=[f"x{v:.2f}" for v in torch_speedup],
+            textfont=dict(size=14),
+            textposition="outside",
+            # width=[bar_width] * len(x_labels),
+        )
+    )
+    fig.update_layout(
+        title=dict(
+            text="<b>Speedup over torch (higher is better) (MI250, torch 2.7, ROCm 6.3)</b>",
+            font=dict(size=24),
+        ),
+        legend=dict(
+            x=0.01,
+            y=0.99,
+            xanchor="left",
+            yanchor="top",
+            bgcolor="rgba(0,0,0,0)",
+            bordercolor="black",
+            borderwidth=1,
+        ),
+        font=dict(size=16),
+        yaxis_title="Speedup (torch / activation)",
+        barmode="group",
+        bargroupgap=0,
+        bargap=0.2,
+        xaxis_tickangle=-45,
+        template="plotly_white",
+        yaxis_type="log",
+        shapes=[
+            dict(
+                type="rect",
+                xref="x",
+                yref="paper",  # y축 전체 범위 (0~1)
+                x0=-0.5,
+                x1=len(x_labels) - 0.5,
+                y0=0,
+                y1=1,
+                line=dict(
+                    color="black",
+                    width=1.5,
+                ),
+                fillcolor="rgba(0,0,0,0)",  # 투명 배경
+                layer="above",  # bar 아래에 그리기
+            )
+        ],
+    )
+    output_file = "perf_result.html"
+    fig.write_html(output_file)
+    logger.info(f"Plotting performance results to {output_file}")
+def pytest_addoption(parser):
+    parser.addoption(
+        "--run-perf", action="store_true", default=False, help="Run perf tests"
+    )
+    parser.addoption(
+        "--do-plot", action="store_true", default=False, help="Plot performance results"
+    )
+@pytest.fixture
+def do_plot(request):
+    return request.config.getoption("--do-plot")
+def pytest_configure(config):
+    global DO_PLOT
+    DO_PLOT = config.getoption("--do-plot")
+    run_perf = config.getoption("--run-perf")
+    if DO_PLOT and not run_perf:
+        raise ValueError(
+            "Cannot plot performance results without running performance tests. "
+            "Please use --run-perf option."
+        )
+    config.addinivalue_line("markers", "perf: mark test as performance-related")
+def pytest_collection_modifyitems(config, items):
+    run_perf = config.getoption("--run-perf")
+    skip_perf = pytest.mark.skip(reason="need --run-perf option to run")
+    skip_normal = pytest.mark.skip(
+        reason="normal tests skipped when --run-perf is used"
+    )
+    for item in items:
+        if "perf" in item.keywords and not run_perf:
+            item.add_marker(skip_perf)
+        elif "perf" not in item.keywords and run_perf:
+            item.add_marker(skip_normal)
+def pytest_sessionfinish(session, exitstatus) -> None:
+    if DO_PLOT:
+        plot(PERF_RESULTS)
+    else:
+        logger.info(PERF_RESULTS)

tests/kernels/test_perf.py ADDED Viewed

	@@ -0,0 +1,120 @@

+import random
+from dataclasses import dataclass
+import pytest
+import torch
+import activation
+from .test_activation import poly_norm
+from .utils import assert_close
+# Benchmark cases; each entry is (input shape, dtype) and is unpacked that way
+# inside test_poly_norm.
+CASES = [
+    ((1, 2048, 8192), torch.bfloat16),
+    ((1, 2048, 16384), torch.bfloat16),
+    ((1, 16384, 8192), torch.bfloat16),
+    ((1, 16384, 16384), torch.bfloat16),
+]
+# Number of timed repetitions per measurement; the reported time is the
+# average over these repetitions.
+NUM_REP = 100
+@dataclass
+class PerfResult:
+    """Timing of one benchmark case for the custom kernel and the torch reference."""
+
+    type: str  # forward or backward
+    shape: tuple  # input tensor shape of the case
+    dtype: torch.dtype  # input tensor dtype of the case
+    kernel_time_ms: float  # average time per call of the custom kernel
+    torch_time_ms: float  # average time per call of the torch reference
+    @property
+    def speedup(self) -> float:
+        """Return torch time divided by kernel time (above 1 means the kernel is faster)."""
+        return self.torch_time_ms / self.kernel_time_ms
+# Results appended by test_poly_norm; tests/conftest.py imports this list and
+# plots or logs it at session end.
+PERF_RESULTS: list[PerfResult] = []
+@pytest.mark.parametrize("cases", CASES)
+@pytest.mark.perf
+def test_poly_norm(
+    cases: tuple,
+    do_plot: bool,
+) -> None:
+    """Check PolyNorm against the torch reference, then time forward and backward.
+
+    Two ``PerfResult`` entries ("forward" and "backward") are appended to
+    ``PERF_RESULTS`` for the given case.
+
+    Args:
+        cases: A ``(shape, dtype)`` pair taken from ``CASES``.
+        do_plot: Value of the ``do_plot`` fixture; not used in the test body.
+    """
+    # Fixed seeds so every run benchmarks the same random inputs.
+    random.seed(12345)
+    torch.manual_seed(12345)
+    # All tensors created below are allocated on the "cuda" device.
+    torch.set_default_device("cuda")
+    shape, dtype = cases
+    x = torch.randn(shape, dtype=dtype, requires_grad=True)
+    weight = torch.randn(3, dtype=dtype, requires_grad=True)
+    bias = torch.randn(1, dtype=dtype, requires_grad=True)
+    eps = 1e-05
+    x.retain_grad()
+    weight.retain_grad()
+    bias.retain_grad()
+    # To separate gradient computation, clone the inputs
+    x_ref = x.detach().clone().requires_grad_(True)
+    weight_ref = weight.detach().clone().requires_grad_(True)
+    bias_ref = bias.detach().clone().requires_grad_(True)
+    torch_fn = poly_norm
+    layer = activation.layers.PolyNorm(eps)
+    layer.weight = torch.nn.Parameter(weight)
+    layer.bias = torch.nn.Parameter(bias)
+    # Check correctness
+    mod_out = layer(x)
+    ref_out = torch_fn(x_ref, weight_ref, bias_ref, eps)
+    assert_close(mod_out, ref_out)
+    # Normalized random upstream gradient shared by both backward passes.
+    out_grad = torch.rand_like(ref_out)
+    out_grad = out_grad / out_grad.norm()
+    # retain_graph=True keeps both graphs alive so backward can be re-run
+    # during the timing phase below.
+    ref_out.backward(out_grad, retain_graph=True)
+    mod_out.backward(out_grad, retain_graph=True)
+    assert_close(x.grad, x_ref.grad)
+    assert_close(layer.bias.grad, bias_ref.grad, rtol=0.05)
+    assert_close(layer.weight.grad, weight_ref.grad, rtol=0.05)
+    def time_cuda(fn):
+        """Return the average time per call of ``fn`` measured with CUDA events."""
+        start = torch.cuda.Event(enable_timing=True)
+        end = torch.cuda.Event(enable_timing=True)
+        # Warm-up calls that are excluded from the measurement.
+        for _ in range(5):
+            fn()
+        start.record()
+        for _ in range(NUM_REP):
+            fn()
+        end.record()
+        # Wait for the recorded events to complete before reading the timing.
+        torch.cuda.synchronize()
+        return start.elapsed_time(end) / NUM_REP
+    # Forward timing.
+    kernel_time_ms = time_cuda(lambda: layer(x))
+    torch_fn_time = time_cuda(lambda: torch_fn(x_ref, weight_ref, bias_ref, eps))
+    PERF_RESULTS.append(
+        PerfResult(
+            type="forward",
+            shape=shape,
+            dtype=dtype,
+            kernel_time_ms=kernel_time_ms,
+            torch_time_ms=torch_fn_time,
+        )
+    )
+    # Backward timing: re-runs backward on the outputs computed during the
+    # correctness check above.
+    kernel_time_ms = time_cuda(lambda: mod_out.backward(out_grad, retain_graph=True))
+    torch_fn_time = time_cuda(lambda: ref_out.backward(out_grad, retain_graph=True))
+    PERF_RESULTS.append(
+        PerfResult(
+            type="backward",
+            shape=shape,
+            dtype=dtype,
+            kernel_time_ms=kernel_time_ms,
+            torch_time_ms=torch_fn_time,
+        )
+    )

tests/perf.png ADDED Viewed

Git LFS Details

SHA256: 12f88f9ac4511cb37f38a34e3572e4347bd0c857144a4aaf64bd5981d6b50877
Pointer size: 131 Bytes
Size of remote file: 166 kB

tests/perf_result.html ADDED Viewed

The diff for this file is too large to render. See raw diff