Build (fp8)
This view is limited to 50 files because it contains too many changes. See the raw diff for the remaining files.
- build/torch25-cxx11-cu118-x86_64-linux/moe/{_moe_21a4db0.abi3.so → _moe_2218ad7.abi3.so} +2 -2
- build/torch25-cxx11-cu118-x86_64-linux/moe/_ops.py +3 -3
- build/torch25-cxx11-cu118-x86_64-linux/moe/fused_moe.py +24 -0
- build/torch25-cxx11-cu118-x86_64-linux/moe/layers.py +10 -0
- build/torch25-cxx11-cu121-x86_64-linux/moe/{_moe_21a4db0.abi3.so → _moe_2218ad7.abi3.so} +1 -1
- build/torch25-cxx11-cu121-x86_64-linux/moe/_ops.py +3 -3
- build/torch25-cxx11-cu121-x86_64-linux/moe/fused_moe.py +24 -0
- build/torch25-cxx11-cu121-x86_64-linux/moe/layers.py +10 -0
- build/torch25-cxx11-cu124-x86_64-linux/moe/{_moe_21a4db0.abi3.so → _moe_2218ad7.abi3.so} +1 -1
- build/torch25-cxx11-cu124-x86_64-linux/moe/_ops.py +3 -3
- build/torch25-cxx11-cu124-x86_64-linux/moe/fused_moe.py +24 -0
- build/torch25-cxx11-cu124-x86_64-linux/moe/layers.py +10 -0
- build/{torch26-cxx98-cu118-x86_64-linux/moe/_moe_21a4db0.abi3.so → torch25-cxx98-cu118-x86_64-linux/moe/_moe_2218ad7.abi3.so} +1 -1
- build/torch25-cxx98-cu118-x86_64-linux/moe/_ops.py +3 -3
- build/torch25-cxx98-cu118-x86_64-linux/moe/fused_moe.py +24 -0
- build/torch25-cxx98-cu118-x86_64-linux/moe/layers.py +10 -0
- build/torch25-cxx98-cu121-x86_64-linux/moe/{_moe_21a4db0.abi3.so → _moe_2218ad7.abi3.so} +1 -1
- build/torch25-cxx98-cu121-x86_64-linux/moe/_ops.py +3 -3
- build/torch25-cxx98-cu121-x86_64-linux/moe/fused_moe.py +24 -0
- build/torch25-cxx98-cu121-x86_64-linux/moe/layers.py +10 -0
- build/torch25-cxx98-cu124-x86_64-linux/moe/_moe_21a4db0.abi3.so +0 -3
- build/{torch25-cxx98-cu118-x86_64-linux/moe/_moe_21a4db0.abi3.so → torch25-cxx98-cu124-x86_64-linux/moe/_moe_2218ad7.abi3.so} +2 -2
- build/torch25-cxx98-cu124-x86_64-linux/moe/_ops.py +3 -3
- build/torch25-cxx98-cu124-x86_64-linux/moe/fused_moe.py +24 -0
- build/torch25-cxx98-cu124-x86_64-linux/moe/layers.py +10 -0
- build/torch26-cxx11-cu118-x86_64-linux/moe/{_moe_21a4db0.abi3.so → _moe_2218ad7.abi3.so} +1 -1
- build/torch26-cxx11-cu118-x86_64-linux/moe/_ops.py +3 -3
- build/torch26-cxx11-cu118-x86_64-linux/moe/fused_moe.py +24 -0
- build/torch26-cxx11-cu118-x86_64-linux/moe/layers.py +10 -0
- build/torch26-cxx11-cu124-x86_64-linux/moe/_moe_21a4db0.abi3.so +0 -3
- build/torch26-cxx11-cu124-x86_64-linux/moe/_moe_2218ad7.abi3.so +3 -0
- build/torch26-cxx11-cu124-x86_64-linux/moe/_ops.py +3 -3
- build/torch26-cxx11-cu124-x86_64-linux/moe/fused_moe.py +24 -0
- build/torch26-cxx11-cu124-x86_64-linux/moe/layers.py +10 -0
- build/torch26-cxx11-cu126-x86_64-linux/moe/_moe_21a4db0.abi3.so +0 -3
- build/torch26-cxx11-cu126-x86_64-linux/moe/_moe_2218ad7.abi3.so +3 -0
- build/torch26-cxx11-cu126-x86_64-linux/moe/_ops.py +3 -3
- build/torch26-cxx11-cu126-x86_64-linux/moe/fused_moe.py +24 -0
- build/torch26-cxx11-cu126-x86_64-linux/moe/layers.py +10 -0
- build/torch26-cxx98-cu118-x86_64-linux/moe/_moe_2218ad7.abi3.so +3 -0
- build/torch26-cxx98-cu118-x86_64-linux/moe/_ops.py +3 -3
- build/torch26-cxx98-cu118-x86_64-linux/moe/fused_moe.py +24 -0
- build/torch26-cxx98-cu118-x86_64-linux/moe/layers.py +10 -0
- build/torch26-cxx98-cu124-x86_64-linux/moe/{_moe_21a4db0.abi3.so → _moe_2218ad7.abi3.so} +1 -1
- build/torch26-cxx98-cu124-x86_64-linux/moe/_ops.py +3 -3
- build/torch26-cxx98-cu124-x86_64-linux/moe/fused_moe.py +24 -0
- build/torch26-cxx98-cu124-x86_64-linux/moe/layers.py +10 -0
- build/torch26-cxx98-cu126-x86_64-linux/moe/{_moe_21a4db0.abi3.so → _moe_2218ad7.abi3.so} +1 -1
- build/torch26-cxx98-cu126-x86_64-linux/moe/_ops.py +3 -3
- build/torch26-cxx98-cu126-x86_64-linux/moe/fused_moe.py +24 -0
build/torch25-cxx11-cu118-x86_64-linux/moe/{_moe_21a4db0.abi3.so → _moe_2218ad7.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:49dc6c1d936b3dc6c483a4ef5d581c5d2f08f50f6ea2ffcdbfecdf0b719c3410
+size 87056328
build/torch25-cxx11-cu118-x86_64-linux/moe/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import
-ops = torch.ops.
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7

 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"
+    return f"_moe_2218ad7::{op_name}"
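The regenerated `_ops.py` pins both the submodule import and the `torch.ops` namespace to the new build hash, so op lookups always resolve against this exact binary. A minimal sketch of what the shim provides (the op name `"moe_sum"` below is a placeholder of mine, not taken from this diff):

```python
# Sketch only: mirrors the generated helper; the real op schemas live in the bundled .so.
def add_op_namespace_prefix(op_name: str) -> str:
    """Prefix op by namespace."""
    return f"_moe_2218ad7::{op_name}"


# Qualified names keep two different builds of the kernel package from colliding
# in one process, because each registers its ops under its own hashed namespace.
assert add_op_namespace_prefix("moe_sum") == "_moe_2218ad7::moe_sum"  # placeholder op name
```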
build/torch25-cxx11-cu118-x86_64-linux/moe/fused_moe.py
CHANGED
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )


+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs. If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
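The new `_fp8_quantize` helper defers to the extension's `scaled_fp8_quant` and `per_token_group_quant_fp8` kernels; the assert it adds pins down the shape contract `cdiv(K, block_k) == A_scale.shape[-1]`. Below is a slow pure-PyTorch illustration of the blocked case, my own sketch rather than the kernel code, assuming `K` is a multiple of `block_k` and the usual amax-based e4m3 scaling:

```python
import torch

FP8_MAX = torch.finfo(torch.float8_e4m3fn).max  # 448.0 for e4m3fn


def cdiv(a: int, b: int) -> int:
    """Ceiling division (same formulation as the helper added above)."""
    return -(a // -b)


def per_token_group_quant_fp8_ref(A: torch.Tensor, block_k: int):
    """Slow reference for per-token-group fp8 quantization (illustration only)."""
    M, K = A.shape
    assert K % block_k == 0, "sketch assumes K is a multiple of block_k"
    groups = A.view(M, K // block_k, block_k).float()
    # One scale per (token, group): map each group's max magnitude onto the fp8 range.
    amax = groups.abs().amax(dim=-1, keepdim=True).clamp(min=1e-12)
    scale = amax / FP8_MAX
    q = (groups / scale).clamp(-FP8_MAX, FP8_MAX).to(torch.float8_e4m3fn)
    return q.view(M, K), scale.squeeze(-1)


A = torch.randn(4, 256)
A_q, A_scale = per_token_group_quant_fp8_ref(A, block_k=128)
# Exactly the relationship the new assert in _fp8_quantize checks.
assert cdiv(A.shape[-1], 128) == A_scale.shape[-1]
```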
build/torch25-cxx11-cu118-x86_64-linux/moe/layers.py
CHANGED
@@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module):
         _fix_llama4_experts(hidden_states, self.experts)

         router_logits = self.router(hidden_states)
+
+        extra_kwargs = {}
+        use_fp8_w8a8 = False
+        if hasattr(self.experts, "gate_up_proj_scale"):
+            use_fp8_w8a8 = True
+            extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale
+            extra_kwargs["w2_scale"] = self.experts.down_proj_scale
+
         out = fused_moe(
             hidden_states,
             w1=self.experts.gate_up_proj,
@@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module):
             renormalize=False,
             custom_routing_function=_llama4_topk,
             apply_router_weight_on_input=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            **extra_kwargs
         )

         out += self.shared_expert(hidden_states)
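`layers.py` now flips `fused_moe` into the fp8 w8a8 path purely by checking for scale attributes on the experts module and forwarding them as extra keyword arguments. A toy mock of that dispatch (the `FakeExperts` and `build_moe_kwargs` names are invented for illustration; the attribute and kwarg names match the diff):

```python
import torch
import torch.nn as nn


class FakeExperts(nn.Module):
    """Stand-in for the Llama4 experts module; only the attribute names matter here."""

    def __init__(self, quantized: bool):
        super().__init__()
        self.gate_up_proj = nn.Parameter(torch.empty(0))
        self.down_proj = nn.Parameter(torch.empty(0))
        if quantized:
            # fp8 checkpoints carry per-weight scales alongside the weights.
            self.gate_up_proj_scale = nn.Parameter(torch.ones(1))
            self.down_proj_scale = nn.Parameter(torch.ones(1))


def build_moe_kwargs(experts: nn.Module) -> dict:
    """Mirrors the dispatch added to Llama4TextMoe.forward."""
    extra_kwargs = {}
    use_fp8_w8a8 = False
    if hasattr(experts, "gate_up_proj_scale"):
        use_fp8_w8a8 = True
        extra_kwargs["w1_scale"] = experts.gate_up_proj_scale
        extra_kwargs["w2_scale"] = experts.down_proj_scale
    return {"use_fp8_w8a8": use_fp8_w8a8, **extra_kwargs}


print(build_moe_kwargs(FakeExperts(quantized=False)))  # bf16/fp16 path
print(build_moe_kwargs(FakeExperts(quantized=True)))   # fp8 path with both scales
```

The remaining build directories below carry the same Python changes for each torch/CUDA/ABI combination; only the shared-object checksums and sizes differ.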
build/torch25-cxx11-cu121-x86_64-linux/moe/{_moe_21a4db0.abi3.so → _moe_2218ad7.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:12bb26a0a9a47039bbcbf2c5fda7c068211cb711827b0e0e0d98b2fe99ed3b54
 size 87254968
build/torch25-cxx11-cu121-x86_64-linux/moe/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import
-ops = torch.ops.
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7

 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"
+    return f"_moe_2218ad7::{op_name}"
build/torch25-cxx11-cu121-x86_64-linux/moe/fused_moe.py
CHANGED
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )


+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs. If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
build/torch25-cxx11-cu121-x86_64-linux/moe/layers.py
CHANGED
@@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module):
         _fix_llama4_experts(hidden_states, self.experts)

         router_logits = self.router(hidden_states)
+
+        extra_kwargs = {}
+        use_fp8_w8a8 = False
+        if hasattr(self.experts, "gate_up_proj_scale"):
+            use_fp8_w8a8 = True
+            extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale
+            extra_kwargs["w2_scale"] = self.experts.down_proj_scale
+
         out = fused_moe(
             hidden_states,
             w1=self.experts.gate_up_proj,
@@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module):
             renormalize=False,
             custom_routing_function=_llama4_topk,
             apply_router_weight_on_input=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            **extra_kwargs
         )

         out += self.shared_expert(hidden_states)
build/torch25-cxx11-cu124-x86_64-linux/moe/{_moe_21a4db0.abi3.so → _moe_2218ad7.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:ca9a24c28dab4109a13549ee7ce379b36d950930b8bd106669188262863f3795
 size 86965608
build/torch25-cxx11-cu124-x86_64-linux/moe/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import
-ops = torch.ops.
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7

 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"
+    return f"_moe_2218ad7::{op_name}"
build/torch25-cxx11-cu124-x86_64-linux/moe/fused_moe.py
CHANGED
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )


+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs. If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
build/torch25-cxx11-cu124-x86_64-linux/moe/layers.py
CHANGED
@@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module):
         _fix_llama4_experts(hidden_states, self.experts)

         router_logits = self.router(hidden_states)
+
+        extra_kwargs = {}
+        use_fp8_w8a8 = False
+        if hasattr(self.experts, "gate_up_proj_scale"):
+            use_fp8_w8a8 = True
+            extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale
+            extra_kwargs["w2_scale"] = self.experts.down_proj_scale
+
         out = fused_moe(
             hidden_states,
             w1=self.experts.gate_up_proj,
@@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module):
             renormalize=False,
             custom_routing_function=_llama4_topk,
             apply_router_weight_on_input=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            **extra_kwargs
         )

         out += self.shared_expert(hidden_states)
build/{torch26-cxx98-cu118-x86_64-linux/moe/_moe_21a4db0.abi3.so → torch25-cxx98-cu118-x86_64-linux/moe/_moe_2218ad7.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:d65d3a08c44b65a44d2c58566aa7e26e85d0d949be71096e09f7ad73d0b5e040
 size 87048408
build/torch25-cxx98-cu118-x86_64-linux/moe/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import
-ops = torch.ops.
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7

 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"
+    return f"_moe_2218ad7::{op_name}"
build/torch25-cxx98-cu118-x86_64-linux/moe/fused_moe.py
CHANGED
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )


+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs. If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
build/torch25-cxx98-cu118-x86_64-linux/moe/layers.py
CHANGED
@@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module):
         _fix_llama4_experts(hidden_states, self.experts)

         router_logits = self.router(hidden_states)
+
+        extra_kwargs = {}
+        use_fp8_w8a8 = False
+        if hasattr(self.experts, "gate_up_proj_scale"):
+            use_fp8_w8a8 = True
+            extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale
+            extra_kwargs["w2_scale"] = self.experts.down_proj_scale
+
         out = fused_moe(
             hidden_states,
             w1=self.experts.gate_up_proj,
@@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module):
             renormalize=False,
             custom_routing_function=_llama4_topk,
             apply_router_weight_on_input=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            **extra_kwargs
         )

         out += self.shared_expert(hidden_states)
build/torch25-cxx98-cu121-x86_64-linux/moe/{_moe_21a4db0.abi3.so → _moe_2218ad7.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:d2d4157287a3e7979780f23a709eba01e787186bc32a5e56c0620b5429e9cfd3
 size 87243240
build/torch25-cxx98-cu121-x86_64-linux/moe/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import
-ops = torch.ops.
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7

 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"
+    return f"_moe_2218ad7::{op_name}"
build/torch25-cxx98-cu121-x86_64-linux/moe/fused_moe.py
CHANGED
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )


+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs. If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
build/torch25-cxx98-cu121-x86_64-linux/moe/layers.py
CHANGED
@@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module):
         _fix_llama4_experts(hidden_states, self.experts)

         router_logits = self.router(hidden_states)
+
+        extra_kwargs = {}
+        use_fp8_w8a8 = False
+        if hasattr(self.experts, "gate_up_proj_scale"):
+            use_fp8_w8a8 = True
+            extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale
+            extra_kwargs["w2_scale"] = self.experts.down_proj_scale
+
         out = fused_moe(
             hidden_states,
             w1=self.experts.gate_up_proj,
@@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module):
             renormalize=False,
             custom_routing_function=_llama4_topk,
             apply_router_weight_on_input=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            **extra_kwargs
         )

         out += self.shared_expert(hidden_states)
build/torch25-cxx98-cu124-x86_64-linux/moe/_moe_21a4db0.abi3.so
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:820b62662956741ae78d7c51fb9fc978ff2e86c7dc1efa1335b0701e0e28749a
-size 86957976
build/{torch25-cxx98-cu118-x86_64-linux/moe/_moe_21a4db0.abi3.so → torch25-cxx98-cu124-x86_64-linux/moe/_moe_2218ad7.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:122544181246b179a772eb07c9e01c8df6b3025c20b333c566d0e84bfd7bea2d
+size 86953880
build/torch25-cxx98-cu124-x86_64-linux/moe/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import
-ops = torch.ops.
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7

 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"
+    return f"_moe_2218ad7::{op_name}"
build/torch25-cxx98-cu124-x86_64-linux/moe/fused_moe.py
CHANGED
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )


+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs. If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
build/torch25-cxx98-cu124-x86_64-linux/moe/layers.py
CHANGED
@@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module):
         _fix_llama4_experts(hidden_states, self.experts)

         router_logits = self.router(hidden_states)
+
+        extra_kwargs = {}
+        use_fp8_w8a8 = False
+        if hasattr(self.experts, "gate_up_proj_scale"):
+            use_fp8_w8a8 = True
+            extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale
+            extra_kwargs["w2_scale"] = self.experts.down_proj_scale
+
         out = fused_moe(
             hidden_states,
             w1=self.experts.gate_up_proj,
@@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module):
             renormalize=False,
             custom_routing_function=_llama4_topk,
             apply_router_weight_on_input=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            **extra_kwargs
         )

         out += self.shared_expert(hidden_states)
build/torch26-cxx11-cu118-x86_64-linux/moe/{_moe_21a4db0.abi3.so → _moe_2218ad7.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:49e17eb28438bddf98e314893cf262b807d64ee03850b46abe4d0bf6151f62b6
 size 87060352
build/torch26-cxx11-cu118-x86_64-linux/moe/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import
-ops = torch.ops.
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7

 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"
+    return f"_moe_2218ad7::{op_name}"
build/torch26-cxx11-cu118-x86_64-linux/moe/fused_moe.py
CHANGED
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )


+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs. If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
build/torch26-cxx11-cu118-x86_64-linux/moe/layers.py
CHANGED
@@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module):
         _fix_llama4_experts(hidden_states, self.experts)

         router_logits = self.router(hidden_states)
+
+        extra_kwargs = {}
+        use_fp8_w8a8 = False
+        if hasattr(self.experts, "gate_up_proj_scale"):
+            use_fp8_w8a8 = True
+            extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale
+            extra_kwargs["w2_scale"] = self.experts.down_proj_scale
+
         out = fused_moe(
             hidden_states,
             w1=self.experts.gate_up_proj,
@@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module):
             renormalize=False,
             custom_routing_function=_llama4_topk,
             apply_router_weight_on_input=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            **extra_kwargs
         )

         out += self.shared_expert(hidden_states)
build/torch26-cxx11-cu124-x86_64-linux/moe/_moe_21a4db0.abi3.so
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:9a9acc9198a56410e1d6bddec3a4529fb14b12843f6589b4477bc4ee795f7278
-size 86961568
build/torch26-cxx11-cu124-x86_64-linux/moe/_moe_2218ad7.abi3.so
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f804164f561c9b46f3b997a6d13552ca4d704c43484b5cd8d14682b4450ed472
+size 86965664
build/torch26-cxx11-cu124-x86_64-linux/moe/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import
-ops = torch.ops.
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7

 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"
+    return f"_moe_2218ad7::{op_name}"
build/torch26-cxx11-cu124-x86_64-linux/moe/fused_moe.py
CHANGED
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )


+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs. If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
build/torch26-cxx11-cu124-x86_64-linux/moe/layers.py
CHANGED
@@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module):
         _fix_llama4_experts(hidden_states, self.experts)

         router_logits = self.router(hidden_states)
+
+        extra_kwargs = {}
+        use_fp8_w8a8 = False
+        if hasattr(self.experts, "gate_up_proj_scale"):
+            use_fp8_w8a8 = True
+            extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale
+            extra_kwargs["w2_scale"] = self.experts.down_proj_scale
+
         out = fused_moe(
             hidden_states,
             w1=self.experts.gate_up_proj,
@@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module):
             renormalize=False,
             custom_routing_function=_llama4_topk,
             apply_router_weight_on_input=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            **extra_kwargs
         )

         out += self.shared_expert(hidden_states)
build/torch26-cxx11-cu126-x86_64-linux/moe/_moe_21a4db0.abi3.so
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:eb26fad3cfe2db1cc88637e020d6d8ddbc54df3e7e8edd64ba9370cd96177587
-size 87428864
build/torch26-cxx11-cu126-x86_64-linux/moe/_moe_2218ad7.abi3.so
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1102bf615b2d2f7c320ac73eed63b982e969683ac72c958080dddb87166fa595
+size 87432960
build/torch26-cxx11-cu126-x86_64-linux/moe/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import
-ops = torch.ops.
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7

 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"
+    return f"_moe_2218ad7::{op_name}"
build/torch26-cxx11-cu126-x86_64-linux/moe/fused_moe.py
CHANGED
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )


+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs. If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
build/torch26-cxx11-cu126-x86_64-linux/moe/layers.py
CHANGED
@@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module):
         _fix_llama4_experts(hidden_states, self.experts)

         router_logits = self.router(hidden_states)
+
+        extra_kwargs = {}
+        use_fp8_w8a8 = False
+        if hasattr(self.experts, "gate_up_proj_scale"):
+            use_fp8_w8a8 = True
+            extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale
+            extra_kwargs["w2_scale"] = self.experts.down_proj_scale
+
         out = fused_moe(
             hidden_states,
             w1=self.experts.gate_up_proj,
@@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module):
             renormalize=False,
             custom_routing_function=_llama4_topk,
             apply_router_weight_on_input=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            **extra_kwargs
         )

         out += self.shared_expert(hidden_states)
build/torch26-cxx98-cu118-x86_64-linux/moe/_moe_2218ad7.abi3.so
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e739bb546d3d1730fa7696fbd767fd588286dec369f1b7551edd1ec481df96f
+size 87044288
build/torch26-cxx98-cu118-x86_64-linux/moe/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import
-ops = torch.ops.
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7

 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"
+    return f"_moe_2218ad7::{op_name}"
build/torch26-cxx98-cu118-x86_64-linux/moe/fused_moe.py
CHANGED
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )


+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs. If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
build/torch26-cxx98-cu118-x86_64-linux/moe/layers.py
CHANGED
@@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module):
         _fix_llama4_experts(hidden_states, self.experts)

         router_logits = self.router(hidden_states)
+
+        extra_kwargs = {}
+        use_fp8_w8a8 = False
+        if hasattr(self.experts, "gate_up_proj_scale"):
+            use_fp8_w8a8 = True
+            extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale
+            extra_kwargs["w2_scale"] = self.experts.down_proj_scale
+
         out = fused_moe(
             hidden_states,
             w1=self.experts.gate_up_proj,
@@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module):
             renormalize=False,
             custom_routing_function=_llama4_topk,
             apply_router_weight_on_input=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            **extra_kwargs
         )

         out += self.shared_expert(hidden_states)
build/torch26-cxx98-cu124-x86_64-linux/moe/{_moe_21a4db0.abi3.so → _moe_2218ad7.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:bcb950d2e7196ad22cad926749b7e0e06e5454f0a732755b72f0b8dd456529c6
 size 86953856
build/torch26-cxx98-cu124-x86_64-linux/moe/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import
-ops = torch.ops.
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7

 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"
+    return f"_moe_2218ad7::{op_name}"
build/torch26-cxx98-cu124-x86_64-linux/moe/fused_moe.py
CHANGED
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )


+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs. If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
build/torch26-cxx98-cu124-x86_64-linux/moe/layers.py
CHANGED
@@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module):
         _fix_llama4_experts(hidden_states, self.experts)

         router_logits = self.router(hidden_states)
+
+        extra_kwargs = {}
+        use_fp8_w8a8 = False
+        if hasattr(self.experts, "gate_up_proj_scale"):
+            use_fp8_w8a8 = True
+            extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale
+            extra_kwargs["w2_scale"] = self.experts.down_proj_scale
+
         out = fused_moe(
             hidden_states,
             w1=self.experts.gate_up_proj,
@@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module):
             renormalize=False,
             custom_routing_function=_llama4_topk,
             apply_router_weight_on_input=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            **extra_kwargs
         )

         out += self.shared_expert(hidden_states)
build/torch26-cxx98-cu126-x86_64-linux/moe/{_moe_21a4db0.abi3.so → _moe_2218ad7.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:fe5c605f1da902aebc1d7ce0355b649fcfcc44aed0023fdc87974f3d56273897
 size 87417064
build/torch26-cxx98-cu126-x86_64-linux/moe/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import
-ops = torch.ops.
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7

 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"
+    return f"_moe_2218ad7::{op_name}"
build/torch26-cxx98-cu126-x86_64-linux/moe/fused_moe.py
CHANGED
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )


+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs. If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,