kernel
danieldk (HF Staff) committed on
Commit e84674f · 1 Parent(s): 2218ad7

Build (fp8)

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. build/torch25-cxx11-cu118-x86_64-linux/moe/{_moe_21a4db0.abi3.so → _moe_2218ad7.abi3.so} +2 -2
  2. build/torch25-cxx11-cu118-x86_64-linux/moe/_ops.py +3 -3
  3. build/torch25-cxx11-cu118-x86_64-linux/moe/fused_moe.py +24 -0
  4. build/torch25-cxx11-cu118-x86_64-linux/moe/layers.py +10 -0
  5. build/torch25-cxx11-cu121-x86_64-linux/moe/{_moe_21a4db0.abi3.so → _moe_2218ad7.abi3.so} +1 -1
  6. build/torch25-cxx11-cu121-x86_64-linux/moe/_ops.py +3 -3
  7. build/torch25-cxx11-cu121-x86_64-linux/moe/fused_moe.py +24 -0
  8. build/torch25-cxx11-cu121-x86_64-linux/moe/layers.py +10 -0
  9. build/torch25-cxx11-cu124-x86_64-linux/moe/{_moe_21a4db0.abi3.so → _moe_2218ad7.abi3.so} +1 -1
  10. build/torch25-cxx11-cu124-x86_64-linux/moe/_ops.py +3 -3
  11. build/torch25-cxx11-cu124-x86_64-linux/moe/fused_moe.py +24 -0
  12. build/torch25-cxx11-cu124-x86_64-linux/moe/layers.py +10 -0
  13. build/{torch26-cxx98-cu118-x86_64-linux/moe/_moe_21a4db0.abi3.so → torch25-cxx98-cu118-x86_64-linux/moe/_moe_2218ad7.abi3.so} +1 -1
  14. build/torch25-cxx98-cu118-x86_64-linux/moe/_ops.py +3 -3
  15. build/torch25-cxx98-cu118-x86_64-linux/moe/fused_moe.py +24 -0
  16. build/torch25-cxx98-cu118-x86_64-linux/moe/layers.py +10 -0
  17. build/torch25-cxx98-cu121-x86_64-linux/moe/{_moe_21a4db0.abi3.so → _moe_2218ad7.abi3.so} +1 -1
  18. build/torch25-cxx98-cu121-x86_64-linux/moe/_ops.py +3 -3
  19. build/torch25-cxx98-cu121-x86_64-linux/moe/fused_moe.py +24 -0
  20. build/torch25-cxx98-cu121-x86_64-linux/moe/layers.py +10 -0
  21. build/torch25-cxx98-cu124-x86_64-linux/moe/_moe_21a4db0.abi3.so +0 -3
  22. build/{torch25-cxx98-cu118-x86_64-linux/moe/_moe_21a4db0.abi3.so → torch25-cxx98-cu124-x86_64-linux/moe/_moe_2218ad7.abi3.so} +2 -2
  23. build/torch25-cxx98-cu124-x86_64-linux/moe/_ops.py +3 -3
  24. build/torch25-cxx98-cu124-x86_64-linux/moe/fused_moe.py +24 -0
  25. build/torch25-cxx98-cu124-x86_64-linux/moe/layers.py +10 -0
  26. build/torch26-cxx11-cu118-x86_64-linux/moe/{_moe_21a4db0.abi3.so → _moe_2218ad7.abi3.so} +1 -1
  27. build/torch26-cxx11-cu118-x86_64-linux/moe/_ops.py +3 -3
  28. build/torch26-cxx11-cu118-x86_64-linux/moe/fused_moe.py +24 -0
  29. build/torch26-cxx11-cu118-x86_64-linux/moe/layers.py +10 -0
  30. build/torch26-cxx11-cu124-x86_64-linux/moe/_moe_21a4db0.abi3.so +0 -3
  31. build/torch26-cxx11-cu124-x86_64-linux/moe/_moe_2218ad7.abi3.so +3 -0
  32. build/torch26-cxx11-cu124-x86_64-linux/moe/_ops.py +3 -3
  33. build/torch26-cxx11-cu124-x86_64-linux/moe/fused_moe.py +24 -0
  34. build/torch26-cxx11-cu124-x86_64-linux/moe/layers.py +10 -0
  35. build/torch26-cxx11-cu126-x86_64-linux/moe/_moe_21a4db0.abi3.so +0 -3
  36. build/torch26-cxx11-cu126-x86_64-linux/moe/_moe_2218ad7.abi3.so +3 -0
  37. build/torch26-cxx11-cu126-x86_64-linux/moe/_ops.py +3 -3
  38. build/torch26-cxx11-cu126-x86_64-linux/moe/fused_moe.py +24 -0
  39. build/torch26-cxx11-cu126-x86_64-linux/moe/layers.py +10 -0
  40. build/torch26-cxx98-cu118-x86_64-linux/moe/_moe_2218ad7.abi3.so +3 -0
  41. build/torch26-cxx98-cu118-x86_64-linux/moe/_ops.py +3 -3
  42. build/torch26-cxx98-cu118-x86_64-linux/moe/fused_moe.py +24 -0
  43. build/torch26-cxx98-cu118-x86_64-linux/moe/layers.py +10 -0
  44. build/torch26-cxx98-cu124-x86_64-linux/moe/{_moe_21a4db0.abi3.so → _moe_2218ad7.abi3.so} +1 -1
  45. build/torch26-cxx98-cu124-x86_64-linux/moe/_ops.py +3 -3
  46. build/torch26-cxx98-cu124-x86_64-linux/moe/fused_moe.py +24 -0
  47. build/torch26-cxx98-cu124-x86_64-linux/moe/layers.py +10 -0
  48. build/torch26-cxx98-cu126-x86_64-linux/moe/{_moe_21a4db0.abi3.so → _moe_2218ad7.abi3.so} +1 -1
  49. build/torch26-cxx98-cu126-x86_64-linux/moe/_ops.py +3 -3
  50. build/torch26-cxx98-cu126-x86_64-linux/moe/fused_moe.py +24 -0
build/torch25-cxx11-cu118-x86_64-linux/moe/{_moe_21a4db0.abi3.so → _moe_2218ad7.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:03298359c23e496df84a18978298d3372423f7733fb8185b2f6a535d25a64a7e
-size 87060424
+oid sha256:49dc6c1d936b3dc6c483a4ef5d581c5d2f08f50f6ea2ffcdbfecdf0b719c3410
+size 87056328
build/torch25-cxx11-cu118-x86_64-linux/moe/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _moe_21a4db0
-ops = torch.ops._moe_21a4db0
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_moe_21a4db0::{op_name}"
+    return f"_moe_2218ad7::{op_name}"
build/torch25-cxx11-cu118-x86_64-linux/moe/fused_moe.py CHANGED
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )
 
 
+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs. If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
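The new `_fp8_quantize` helper defers to the kernel's own `scaled_fp8_quant` and `per_token_group_quant_fp8` ops; its assert only holds if the blocked path produces one scale per `block_k`-sized group along the last dimension. A rough pure-PyTorch emulation of that blocked branch, for intuition only (not the CUDA implementation; the padding and clamping details are assumptions):

```python
import torch
import torch.nn.functional as F

def per_token_group_quant_fp8_ref(A: torch.Tensor, group_k: int):
    """Reference emulation: one fp8 scale per group_k-wide slice of the last dim."""
    T, K = A.shape
    n_groups = -(K // -group_k)                    # cdiv(K, group_k)
    A_pad = F.pad(A, (0, n_groups * group_k - K))  # pad K up to a multiple of group_k
    groups = A_pad.view(T, n_groups, group_k)
    fp8_max = torch.finfo(torch.float8_e4m3fn).max
    scale = groups.abs().amax(dim=-1).clamp(min=1e-12) / fp8_max  # [T, n_groups]
    q = (groups / scale.unsqueeze(-1)).to(torch.float8_e4m3fn)
    return q.view(T, -1)[:, :K], scale

A = torch.randn(4, 96)
q, s = per_token_group_quant_fp8_ref(A, group_k=32)
assert s.shape[-1] == -(A.shape[-1] // -32)  # the same invariant the new assert checks
```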
build/torch25-cxx11-cu118-x86_64-linux/moe/layers.py CHANGED
@@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module):
         _fix_llama4_experts(hidden_states, self.experts)
 
         router_logits = self.router(hidden_states)
+
+        extra_kwargs = {}
+        use_fp8_w8a8 = False
+        if hasattr(self.experts, "gate_up_proj_scale"):
+            use_fp8_w8a8 = True
+            extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale
+            extra_kwargs["w2_scale"] = self.experts.down_proj_scale
+
         out = fused_moe(
             hidden_states,
             w1=self.experts.gate_up_proj,
@@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module):
             renormalize=False,
             custom_routing_function=_llama4_topk,
             apply_router_weight_on_input=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            **extra_kwargs
         )
 
         out += self.shared_expert(hidden_states)
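The scale handling is gathered into `extra_kwargs` so the bf16 call signature stays untouched: an fp8 checkpoint is detected purely by the presence of the `gate_up_proj_scale` buffer on the experts module, and only then are `w1_scale`/`w2_scale` forwarded along with `use_fp8_w8a8=True`. The same dispatch pattern in isolation, as a sketch with stand-in expert objects rather than the module's actual weights:

```python
from types import SimpleNamespace

def build_fp8_kwargs(experts):
    """Mirror the branch above: forward weight scales only when they exist."""
    extra_kwargs = {}
    use_fp8_w8a8 = hasattr(experts, "gate_up_proj_scale")
    if use_fp8_w8a8:
        extra_kwargs["w1_scale"] = experts.gate_up_proj_scale
        extra_kwargs["w2_scale"] = experts.down_proj_scale
    return dict(use_fp8_w8a8=use_fp8_w8a8, **extra_kwargs)

bf16_experts = SimpleNamespace()
fp8_experts = SimpleNamespace(gate_up_proj_scale=1.0, down_proj_scale=1.0)

assert build_fp8_kwargs(bf16_experts) == {"use_fp8_w8a8": False}
assert build_fp8_kwargs(fp8_experts)["w1_scale"] == 1.0
```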
build/torch25-cxx11-cu121-x86_64-linux/moe/{_moe_21a4db0.abi3.so → _moe_2218ad7.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:459eff1ecfb7a0b08c7733cf7d87073d44c1cc2c123db5437fc2d96699968b24
+oid sha256:12bb26a0a9a47039bbcbf2c5fda7c068211cb711827b0e0e0d98b2fe99ed3b54
 size 87254968
build/torch25-cxx11-cu121-x86_64-linux/moe/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _moe_21a4db0
-ops = torch.ops._moe_21a4db0
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_moe_21a4db0::{op_name}"
+    return f"_moe_2218ad7::{op_name}"
build/torch25-cxx11-cu121-x86_64-linux/moe/fused_moe.py CHANGED
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )
 
 
+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs. If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
build/torch25-cxx11-cu121-x86_64-linux/moe/layers.py CHANGED
@@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module):
         _fix_llama4_experts(hidden_states, self.experts)
 
         router_logits = self.router(hidden_states)
+
+        extra_kwargs = {}
+        use_fp8_w8a8 = False
+        if hasattr(self.experts, "gate_up_proj_scale"):
+            use_fp8_w8a8 = True
+            extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale
+            extra_kwargs["w2_scale"] = self.experts.down_proj_scale
+
         out = fused_moe(
             hidden_states,
             w1=self.experts.gate_up_proj,
@@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module):
             renormalize=False,
             custom_routing_function=_llama4_topk,
             apply_router_weight_on_input=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            **extra_kwargs
         )
 
         out += self.shared_expert(hidden_states)
build/torch25-cxx11-cu124-x86_64-linux/moe/{_moe_21a4db0.abi3.so → _moe_2218ad7.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ab56f79975737060a41592e8be9f56d92e599b04d3cb9a95c8f515afb341b6a2
+oid sha256:ca9a24c28dab4109a13549ee7ce379b36d950930b8bd106669188262863f3795
 size 86965608
build/torch25-cxx11-cu124-x86_64-linux/moe/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _moe_21a4db0
-ops = torch.ops._moe_21a4db0
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_moe_21a4db0::{op_name}"
+    return f"_moe_2218ad7::{op_name}"
build/torch25-cxx11-cu124-x86_64-linux/moe/fused_moe.py CHANGED
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )
 
 
+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs. If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
build/torch25-cxx11-cu124-x86_64-linux/moe/layers.py CHANGED
@@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module):
         _fix_llama4_experts(hidden_states, self.experts)
 
         router_logits = self.router(hidden_states)
+
+        extra_kwargs = {}
+        use_fp8_w8a8 = False
+        if hasattr(self.experts, "gate_up_proj_scale"):
+            use_fp8_w8a8 = True
+            extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale
+            extra_kwargs["w2_scale"] = self.experts.down_proj_scale
+
         out = fused_moe(
             hidden_states,
             w1=self.experts.gate_up_proj,
@@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module):
             renormalize=False,
             custom_routing_function=_llama4_topk,
             apply_router_weight_on_input=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            **extra_kwargs
        )
 
         out += self.shared_expert(hidden_states)
build/{torch26-cxx98-cu118-x86_64-linux/moe/_moe_21a4db0.abi3.so → torch25-cxx98-cu118-x86_64-linux/moe/_moe_2218ad7.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5e0d9837b95dde6e7286a15e312db47fbc302850ca6ea5610fb306ba42bfbab7
+oid sha256:d65d3a08c44b65a44d2c58566aa7e26e85d0d949be71096e09f7ad73d0b5e040
 size 87048408
build/torch25-cxx98-cu118-x86_64-linux/moe/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _moe_21a4db0
-ops = torch.ops._moe_21a4db0
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_moe_21a4db0::{op_name}"
+    return f"_moe_2218ad7::{op_name}"
build/torch25-cxx98-cu118-x86_64-linux/moe/fused_moe.py CHANGED
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )
 
 
+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs. If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
build/torch25-cxx98-cu118-x86_64-linux/moe/layers.py CHANGED
@@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module):
         _fix_llama4_experts(hidden_states, self.experts)
 
         router_logits = self.router(hidden_states)
+
+        extra_kwargs = {}
+        use_fp8_w8a8 = False
+        if hasattr(self.experts, "gate_up_proj_scale"):
+            use_fp8_w8a8 = True
+            extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale
+            extra_kwargs["w2_scale"] = self.experts.down_proj_scale
+
         out = fused_moe(
             hidden_states,
             w1=self.experts.gate_up_proj,
@@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module):
             renormalize=False,
             custom_routing_function=_llama4_topk,
             apply_router_weight_on_input=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            **extra_kwargs
         )
 
         out += self.shared_expert(hidden_states)
build/torch25-cxx98-cu121-x86_64-linux/moe/{_moe_21a4db0.abi3.so → _moe_2218ad7.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f616a904eda2f4a23a23556ef522449f9d1b111ea5a9c215d7d04c3ccb9345fe
+oid sha256:d2d4157287a3e7979780f23a709eba01e787186bc32a5e56c0620b5429e9cfd3
 size 87243240
build/torch25-cxx98-cu121-x86_64-linux/moe/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _moe_21a4db0
-ops = torch.ops._moe_21a4db0
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_moe_21a4db0::{op_name}"
+    return f"_moe_2218ad7::{op_name}"
build/torch25-cxx98-cu121-x86_64-linux/moe/fused_moe.py CHANGED
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )
 
 
+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs. If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
build/torch25-cxx98-cu121-x86_64-linux/moe/layers.py CHANGED
@@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module):
         _fix_llama4_experts(hidden_states, self.experts)
 
         router_logits = self.router(hidden_states)
+
+        extra_kwargs = {}
+        use_fp8_w8a8 = False
+        if hasattr(self.experts, "gate_up_proj_scale"):
+            use_fp8_w8a8 = True
+            extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale
+            extra_kwargs["w2_scale"] = self.experts.down_proj_scale
+
         out = fused_moe(
             hidden_states,
             w1=self.experts.gate_up_proj,
@@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module):
             renormalize=False,
             custom_routing_function=_llama4_topk,
             apply_router_weight_on_input=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            **extra_kwargs
         )
 
         out += self.shared_expert(hidden_states)
build/torch25-cxx98-cu124-x86_64-linux/moe/_moe_21a4db0.abi3.so DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:820b62662956741ae78d7c51fb9fc978ff2e86c7dc1efa1335b0701e0e28749a
-size 86957976
build/{torch25-cxx98-cu118-x86_64-linux/moe/_moe_21a4db0.abi3.so → torch25-cxx98-cu124-x86_64-linux/moe/_moe_2218ad7.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:af44b6b8253f8b388158eda88cd12a59bec61aede5702ca684da10096e2708cb
-size 87052528
+oid sha256:122544181246b179a772eb07c9e01c8df6b3025c20b333c566d0e84bfd7bea2d
+size 86953880
build/torch25-cxx98-cu124-x86_64-linux/moe/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _moe_21a4db0
-ops = torch.ops._moe_21a4db0
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_moe_21a4db0::{op_name}"
+    return f"_moe_2218ad7::{op_name}"
build/torch25-cxx98-cu124-x86_64-linux/moe/fused_moe.py CHANGED
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )
 
 
+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs. If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
build/torch25-cxx98-cu124-x86_64-linux/moe/layers.py CHANGED
@@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module):
         _fix_llama4_experts(hidden_states, self.experts)
 
         router_logits = self.router(hidden_states)
+
+        extra_kwargs = {}
+        use_fp8_w8a8 = False
+        if hasattr(self.experts, "gate_up_proj_scale"):
+            use_fp8_w8a8 = True
+            extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale
+            extra_kwargs["w2_scale"] = self.experts.down_proj_scale
+
         out = fused_moe(
             hidden_states,
             w1=self.experts.gate_up_proj,
@@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module):
             renormalize=False,
             custom_routing_function=_llama4_topk,
             apply_router_weight_on_input=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            **extra_kwargs
         )
 
         out += self.shared_expert(hidden_states)
build/torch26-cxx11-cu118-x86_64-linux/moe/{_moe_21a4db0.abi3.so → _moe_2218ad7.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c9916c6b52d9381c94b09b78f4f6756895e51563c0049f462363937f40c0114f
+oid sha256:49e17eb28438bddf98e314893cf262b807d64ee03850b46abe4d0bf6151f62b6
 size 87060352
build/torch26-cxx11-cu118-x86_64-linux/moe/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _moe_21a4db0
-ops = torch.ops._moe_21a4db0
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_moe_21a4db0::{op_name}"
+    return f"_moe_2218ad7::{op_name}"
build/torch26-cxx11-cu118-x86_64-linux/moe/fused_moe.py CHANGED
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )
 
 
+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs. If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
build/torch26-cxx11-cu118-x86_64-linux/moe/layers.py CHANGED
@@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module):
         _fix_llama4_experts(hidden_states, self.experts)
 
         router_logits = self.router(hidden_states)
+
+        extra_kwargs = {}
+        use_fp8_w8a8 = False
+        if hasattr(self.experts, "gate_up_proj_scale"):
+            use_fp8_w8a8 = True
+            extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale
+            extra_kwargs["w2_scale"] = self.experts.down_proj_scale
+
         out = fused_moe(
             hidden_states,
             w1=self.experts.gate_up_proj,
@@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module):
             renormalize=False,
             custom_routing_function=_llama4_topk,
             apply_router_weight_on_input=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            **extra_kwargs
         )
 
         out += self.shared_expert(hidden_states)
build/torch26-cxx11-cu124-x86_64-linux/moe/_moe_21a4db0.abi3.so DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:9a9acc9198a56410e1d6bddec3a4529fb14b12843f6589b4477bc4ee795f7278
-size 86961568
build/torch26-cxx11-cu124-x86_64-linux/moe/_moe_2218ad7.abi3.so ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f804164f561c9b46f3b997a6d13552ca4d704c43484b5cd8d14682b4450ed472
+size 86965664
build/torch26-cxx11-cu124-x86_64-linux/moe/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _moe_21a4db0
-ops = torch.ops._moe_21a4db0
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_moe_21a4db0::{op_name}"
+    return f"_moe_2218ad7::{op_name}"
build/torch26-cxx11-cu124-x86_64-linux/moe/fused_moe.py CHANGED
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )
 
 
+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs. If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
build/torch26-cxx11-cu124-x86_64-linux/moe/layers.py CHANGED
@@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module):
         _fix_llama4_experts(hidden_states, self.experts)
 
         router_logits = self.router(hidden_states)
+
+        extra_kwargs = {}
+        use_fp8_w8a8 = False
+        if hasattr(self.experts, "gate_up_proj_scale"):
+            use_fp8_w8a8 = True
+            extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale
+            extra_kwargs["w2_scale"] = self.experts.down_proj_scale
+
         out = fused_moe(
             hidden_states,
             w1=self.experts.gate_up_proj,
@@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module):
             renormalize=False,
             custom_routing_function=_llama4_topk,
             apply_router_weight_on_input=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            **extra_kwargs
         )
 
         out += self.shared_expert(hidden_states)
build/torch26-cxx11-cu126-x86_64-linux/moe/_moe_21a4db0.abi3.so DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:eb26fad3cfe2db1cc88637e020d6d8ddbc54df3e7e8edd64ba9370cd96177587
-size 87428864
build/torch26-cxx11-cu126-x86_64-linux/moe/_moe_2218ad7.abi3.so ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1102bf615b2d2f7c320ac73eed63b982e969683ac72c958080dddb87166fa595
+size 87432960
build/torch26-cxx11-cu126-x86_64-linux/moe/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _moe_21a4db0
-ops = torch.ops._moe_21a4db0
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_moe_21a4db0::{op_name}"
+    return f"_moe_2218ad7::{op_name}"
build/torch26-cxx11-cu126-x86_64-linux/moe/fused_moe.py CHANGED
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )
 
 
+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs. If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
build/torch26-cxx11-cu126-x86_64-linux/moe/layers.py CHANGED
@@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module):
         _fix_llama4_experts(hidden_states, self.experts)
 
         router_logits = self.router(hidden_states)
+
+        extra_kwargs = {}
+        use_fp8_w8a8 = False
+        if hasattr(self.experts, "gate_up_proj_scale"):
+            use_fp8_w8a8 = True
+            extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale
+            extra_kwargs["w2_scale"] = self.experts.down_proj_scale
+
         out = fused_moe(
             hidden_states,
             w1=self.experts.gate_up_proj,
@@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module):
             renormalize=False,
             custom_routing_function=_llama4_topk,
             apply_router_weight_on_input=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            **extra_kwargs
         )
 
         out += self.shared_expert(hidden_states)
build/torch26-cxx98-cu118-x86_64-linux/moe/_moe_2218ad7.abi3.so ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e739bb546d3d1730fa7696fbd767fd588286dec369f1b7551edd1ec481df96f
+size 87044288
build/torch26-cxx98-cu118-x86_64-linux/moe/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _moe_21a4db0
-ops = torch.ops._moe_21a4db0
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_moe_21a4db0::{op_name}"
+    return f"_moe_2218ad7::{op_name}"
build/torch26-cxx98-cu118-x86_64-linux/moe/fused_moe.py CHANGED
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )
 
 
+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs. If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
build/torch26-cxx98-cu118-x86_64-linux/moe/layers.py CHANGED
@@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module):
         _fix_llama4_experts(hidden_states, self.experts)
 
         router_logits = self.router(hidden_states)
+
+        extra_kwargs = {}
+        use_fp8_w8a8 = False
+        if hasattr(self.experts, "gate_up_proj_scale"):
+            use_fp8_w8a8 = True
+            extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale
+            extra_kwargs["w2_scale"] = self.experts.down_proj_scale
+
         out = fused_moe(
             hidden_states,
             w1=self.experts.gate_up_proj,
@@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module):
             renormalize=False,
             custom_routing_function=_llama4_topk,
             apply_router_weight_on_input=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            **extra_kwargs
         )
 
         out += self.shared_expert(hidden_states)
build/torch26-cxx98-cu124-x86_64-linux/moe/{_moe_21a4db0.abi3.so → _moe_2218ad7.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f7abba352c56f674e909a701b965b1508babdf69e9a6dda54fac6f11088d0ac2
+oid sha256:bcb950d2e7196ad22cad926749b7e0e06e5454f0a732755b72f0b8dd456529c6
 size 86953856
build/torch26-cxx98-cu124-x86_64-linux/moe/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _moe_21a4db0
-ops = torch.ops._moe_21a4db0
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_moe_21a4db0::{op_name}"
+    return f"_moe_2218ad7::{op_name}"
build/torch26-cxx98-cu124-x86_64-linux/moe/fused_moe.py CHANGED
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )
 
 
+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs. If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
build/torch26-cxx98-cu124-x86_64-linux/moe/layers.py CHANGED
@@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module):
         _fix_llama4_experts(hidden_states, self.experts)
 
         router_logits = self.router(hidden_states)
+
+        extra_kwargs = {}
+        use_fp8_w8a8 = False
+        if hasattr(self.experts, "gate_up_proj_scale"):
+            use_fp8_w8a8 = True
+            extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale
+            extra_kwargs["w2_scale"] = self.experts.down_proj_scale
+
         out = fused_moe(
             hidden_states,
             w1=self.experts.gate_up_proj,
@@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module):
             renormalize=False,
             custom_routing_function=_llama4_topk,
             apply_router_weight_on_input=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            **extra_kwargs
         )
 
         out += self.shared_expert(hidden_states)
build/torch26-cxx98-cu126-x86_64-linux/moe/{_moe_21a4db0.abi3.so → _moe_2218ad7.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6568844b9365cdbe7fbdddfda668e49f7c780028988c5bae2d48a72eeba1650b
+oid sha256:fe5c605f1da902aebc1d7ce0355b649fcfcc44aed0023fdc87974f3d56273897
 size 87417064
build/torch26-cxx98-cu126-x86_64-linux/moe/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _moe_21a4db0
-ops = torch.ops._moe_21a4db0
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_moe_21a4db0::{op_name}"
+    return f"_moe_2218ad7::{op_name}"
build/torch26-cxx98-cu126-x86_64-linux/moe/fused_moe.py CHANGED
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )
 
 
+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs. If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,