Build (fp8)
This view is limited to 50 files because it contains too many changes. See the raw diff for the remaining files.
- build/torch25-cxx11-cu118-x86_64-linux/moe/{_moe_21a4db0.abi3.so → _moe_2218ad7.abi3.so} +2 -2
- build/torch25-cxx11-cu118-x86_64-linux/moe/_ops.py +3 -3
- build/torch25-cxx11-cu118-x86_64-linux/moe/fused_moe.py +24 -0
- build/torch25-cxx11-cu118-x86_64-linux/moe/layers.py +10 -0
- build/torch25-cxx11-cu121-x86_64-linux/moe/{_moe_21a4db0.abi3.so → _moe_2218ad7.abi3.so} +1 -1
- build/torch25-cxx11-cu121-x86_64-linux/moe/_ops.py +3 -3
- build/torch25-cxx11-cu121-x86_64-linux/moe/fused_moe.py +24 -0
- build/torch25-cxx11-cu121-x86_64-linux/moe/layers.py +10 -0
- build/torch25-cxx11-cu124-x86_64-linux/moe/{_moe_21a4db0.abi3.so → _moe_2218ad7.abi3.so} +1 -1
- build/torch25-cxx11-cu124-x86_64-linux/moe/_ops.py +3 -3
- build/torch25-cxx11-cu124-x86_64-linux/moe/fused_moe.py +24 -0
- build/torch25-cxx11-cu124-x86_64-linux/moe/layers.py +10 -0
- build/{torch26-cxx98-cu118-x86_64-linux/moe/_moe_21a4db0.abi3.so → torch25-cxx98-cu118-x86_64-linux/moe/_moe_2218ad7.abi3.so} +1 -1
- build/torch25-cxx98-cu118-x86_64-linux/moe/_ops.py +3 -3
- build/torch25-cxx98-cu118-x86_64-linux/moe/fused_moe.py +24 -0
- build/torch25-cxx98-cu118-x86_64-linux/moe/layers.py +10 -0
- build/torch25-cxx98-cu121-x86_64-linux/moe/{_moe_21a4db0.abi3.so → _moe_2218ad7.abi3.so} +1 -1
- build/torch25-cxx98-cu121-x86_64-linux/moe/_ops.py +3 -3
- build/torch25-cxx98-cu121-x86_64-linux/moe/fused_moe.py +24 -0
- build/torch25-cxx98-cu121-x86_64-linux/moe/layers.py +10 -0
- build/torch25-cxx98-cu124-x86_64-linux/moe/_moe_21a4db0.abi3.so +0 -3
- build/{torch25-cxx98-cu118-x86_64-linux/moe/_moe_21a4db0.abi3.so → torch25-cxx98-cu124-x86_64-linux/moe/_moe_2218ad7.abi3.so} +2 -2
- build/torch25-cxx98-cu124-x86_64-linux/moe/_ops.py +3 -3
- build/torch25-cxx98-cu124-x86_64-linux/moe/fused_moe.py +24 -0
- build/torch25-cxx98-cu124-x86_64-linux/moe/layers.py +10 -0
- build/torch26-cxx11-cu118-x86_64-linux/moe/{_moe_21a4db0.abi3.so → _moe_2218ad7.abi3.so} +1 -1
- build/torch26-cxx11-cu118-x86_64-linux/moe/_ops.py +3 -3
- build/torch26-cxx11-cu118-x86_64-linux/moe/fused_moe.py +24 -0
- build/torch26-cxx11-cu118-x86_64-linux/moe/layers.py +10 -0
- build/torch26-cxx11-cu124-x86_64-linux/moe/_moe_21a4db0.abi3.so +0 -3
- build/torch26-cxx11-cu124-x86_64-linux/moe/_moe_2218ad7.abi3.so +3 -0
- build/torch26-cxx11-cu124-x86_64-linux/moe/_ops.py +3 -3
- build/torch26-cxx11-cu124-x86_64-linux/moe/fused_moe.py +24 -0
- build/torch26-cxx11-cu124-x86_64-linux/moe/layers.py +10 -0
- build/torch26-cxx11-cu126-x86_64-linux/moe/_moe_21a4db0.abi3.so +0 -3
- build/torch26-cxx11-cu126-x86_64-linux/moe/_moe_2218ad7.abi3.so +3 -0
- build/torch26-cxx11-cu126-x86_64-linux/moe/_ops.py +3 -3
- build/torch26-cxx11-cu126-x86_64-linux/moe/fused_moe.py +24 -0
- build/torch26-cxx11-cu126-x86_64-linux/moe/layers.py +10 -0
- build/torch26-cxx98-cu118-x86_64-linux/moe/_moe_2218ad7.abi3.so +3 -0
- build/torch26-cxx98-cu118-x86_64-linux/moe/_ops.py +3 -3
- build/torch26-cxx98-cu118-x86_64-linux/moe/fused_moe.py +24 -0
- build/torch26-cxx98-cu118-x86_64-linux/moe/layers.py +10 -0
- build/torch26-cxx98-cu124-x86_64-linux/moe/{_moe_21a4db0.abi3.so → _moe_2218ad7.abi3.so} +1 -1
- build/torch26-cxx98-cu124-x86_64-linux/moe/_ops.py +3 -3
- build/torch26-cxx98-cu124-x86_64-linux/moe/fused_moe.py +24 -0
- build/torch26-cxx98-cu124-x86_64-linux/moe/layers.py +10 -0
- build/torch26-cxx98-cu126-x86_64-linux/moe/{_moe_21a4db0.abi3.so → _moe_2218ad7.abi3.so} +1 -1
- build/torch26-cxx98-cu126-x86_64-linux/moe/_ops.py +3 -3
- build/torch26-cxx98-cu126-x86_64-linux/moe/fused_moe.py +24 -0
build/torch25-cxx11-cu118-x86_64-linux/moe/{_moe_21a4db0.abi3.so → _moe_2218ad7.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:49dc6c1d936b3dc6c483a4ef5d581c5d2f08f50f6ea2ffcdbfecdf0b719c3410
+size 87056328
build/torch25-cxx11-cu118-x86_64-linux/moe/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import
-ops = torch.ops.
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7

 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"
+    return f"_moe_2218ad7::{op_name}"
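The regenerated `_ops.py` pins both the submodule import and the `torch.ops` namespace to the new build hash, so op lookups always resolve against this exact binary. A minimal sketch of what the shim provides (the op name `"moe_sum"` below is a placeholder of mine, not taken from this diff):

```python
# Sketch only: mirrors the generated helper; the real op schemas live in the bundled .so.
def add_op_namespace_prefix(op_name: str) -> str:
    """Prefix op by namespace."""
    return f"_moe_2218ad7::{op_name}"


# Qualified names keep two different builds of the kernel package from colliding
# in one process, because each registers its ops under its own hashed namespace.
assert add_op_namespace_prefix("moe_sum") == "_moe_2218ad7::moe_sum"  # placeholder op name
```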
build/torch25-cxx11-cu118-x86_64-linux/moe/fused_moe.py
CHANGED
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )


+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs. If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
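The new `_fp8_quantize` helper defers to the extension's `scaled_fp8_quant` and `per_token_group_quant_fp8` kernels; the assert it adds pins down the shape contract `cdiv(K, block_k) == A_scale.shape[-1]`. Below is a slow pure-PyTorch illustration of the blocked case, my own sketch rather than the kernel code, assuming `K` is a multiple of `block_k` and the usual amax-based e4m3 scaling:

```python
import torch

FP8_MAX = torch.finfo(torch.float8_e4m3fn).max  # 448.0 for e4m3fn


def cdiv(a: int, b: int) -> int:
    """Ceiling division (same formulation as the helper added above)."""
    return -(a // -b)


def per_token_group_quant_fp8_ref(A: torch.Tensor, block_k: int):
    """Slow reference for per-token-group fp8 quantization (illustration only)."""
    M, K = A.shape
    assert K % block_k == 0, "sketch assumes K is a multiple of block_k"
    groups = A.view(M, K // block_k, block_k).float()
    # One scale per (token, group): map each group's max magnitude onto the fp8 range.
    amax = groups.abs().amax(dim=-1, keepdim=True).clamp(min=1e-12)
    scale = amax / FP8_MAX
    q = (groups / scale).clamp(-FP8_MAX, FP8_MAX).to(torch.float8_e4m3fn)
    return q.view(M, K), scale.squeeze(-1)


A = torch.randn(4, 256)
A_q, A_scale = per_token_group_quant_fp8_ref(A, block_k=128)
# Exactly the relationship the new assert in _fp8_quantize checks.
assert cdiv(A.shape[-1], 128) == A_scale.shape[-1]
```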
build/torch25-cxx11-cu118-x86_64-linux/moe/layers.py
CHANGED
@@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module):
         _fix_llama4_experts(hidden_states, self.experts)

         router_logits = self.router(hidden_states)
+
+        extra_kwargs = {}
+        use_fp8_w8a8 = False
+        if hasattr(self.experts, "gate_up_proj_scale"):
+            use_fp8_w8a8 = True
+            extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale
+            extra_kwargs["w2_scale"] = self.experts.down_proj_scale
+
         out = fused_moe(
             hidden_states,
             w1=self.experts.gate_up_proj,
@@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module):
             renormalize=False,
             custom_routing_function=_llama4_topk,
             apply_router_weight_on_input=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            **extra_kwargs
         )

         out += self.shared_expert(hidden_states)
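`layers.py` now flips `fused_moe` into the fp8 w8a8 path purely by checking for scale attributes on the experts module and forwarding them as extra keyword arguments. A toy mock of that dispatch (the `FakeExperts` and `build_moe_kwargs` names are invented for illustration; the attribute and kwarg names match the diff):

```python
import torch
import torch.nn as nn


class FakeExperts(nn.Module):
    """Stand-in for the Llama4 experts module; only the attribute names matter here."""

    def __init__(self, quantized: bool):
        super().__init__()
        self.gate_up_proj = nn.Parameter(torch.empty(0))
        self.down_proj = nn.Parameter(torch.empty(0))
        if quantized:
            # fp8 checkpoints carry per-weight scales alongside the weights.
            self.gate_up_proj_scale = nn.Parameter(torch.ones(1))
            self.down_proj_scale = nn.Parameter(torch.ones(1))


def build_moe_kwargs(experts: nn.Module) -> dict:
    """Mirrors the dispatch added to Llama4TextMoe.forward."""
    extra_kwargs = {}
    use_fp8_w8a8 = False
    if hasattr(experts, "gate_up_proj_scale"):
        use_fp8_w8a8 = True
        extra_kwargs["w1_scale"] = experts.gate_up_proj_scale
        extra_kwargs["w2_scale"] = experts.down_proj_scale
    return {"use_fp8_w8a8": use_fp8_w8a8, **extra_kwargs}


print(build_moe_kwargs(FakeExperts(quantized=False)))  # bf16/fp16 path
print(build_moe_kwargs(FakeExperts(quantized=True)))   # fp8 path with both scales
```

The remaining build directories below carry the same Python changes for each torch/CUDA/ABI combination; only the shared-object checksums and sizes differ.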
build/torch25-cxx11-cu121-x86_64-linux/moe/{_moe_21a4db0.abi3.so → _moe_2218ad7.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:12bb26a0a9a47039bbcbf2c5fda7c068211cb711827b0e0e0d98b2fe99ed3b54
 size 87254968
build/torch25-cxx11-cu121-x86_64-linux/moe/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import
-ops = torch.ops.
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7

 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"
+    return f"_moe_2218ad7::{op_name}"
build/torch25-cxx11-cu121-x86_64-linux/moe/fused_moe.py
CHANGED
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )


+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs. If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
build/torch25-cxx11-cu121-x86_64-linux/moe/layers.py
CHANGED
@@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module):
         _fix_llama4_experts(hidden_states, self.experts)

         router_logits = self.router(hidden_states)
+
+        extra_kwargs = {}
+        use_fp8_w8a8 = False
+        if hasattr(self.experts, "gate_up_proj_scale"):
+            use_fp8_w8a8 = True
+            extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale
+            extra_kwargs["w2_scale"] = self.experts.down_proj_scale
+
         out = fused_moe(
             hidden_states,
             w1=self.experts.gate_up_proj,
@@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module):
             renormalize=False,
             custom_routing_function=_llama4_topk,
             apply_router_weight_on_input=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            **extra_kwargs
         )

         out += self.shared_expert(hidden_states)
build/torch25-cxx11-cu124-x86_64-linux/moe/{_moe_21a4db0.abi3.so → _moe_2218ad7.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:ca9a24c28dab4109a13549ee7ce379b36d950930b8bd106669188262863f3795
 size 86965608
build/torch25-cxx11-cu124-x86_64-linux/moe/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import
-ops = torch.ops.
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7

 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"
+    return f"_moe_2218ad7::{op_name}"
build/torch25-cxx11-cu124-x86_64-linux/moe/fused_moe.py
CHANGED
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )


+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs. If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
build/torch25-cxx11-cu124-x86_64-linux/moe/layers.py
CHANGED
@@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module):
         _fix_llama4_experts(hidden_states, self.experts)

         router_logits = self.router(hidden_states)
+
+        extra_kwargs = {}
+        use_fp8_w8a8 = False
+        if hasattr(self.experts, "gate_up_proj_scale"):
+            use_fp8_w8a8 = True
+            extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale
+            extra_kwargs["w2_scale"] = self.experts.down_proj_scale
+
         out = fused_moe(
             hidden_states,
             w1=self.experts.gate_up_proj,
@@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module):
             renormalize=False,
             custom_routing_function=_llama4_topk,
             apply_router_weight_on_input=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            **extra_kwargs
         )

         out += self.shared_expert(hidden_states)
build/{torch26-cxx98-cu118-x86_64-linux/moe/_moe_21a4db0.abi3.so → torch25-cxx98-cu118-x86_64-linux/moe/_moe_2218ad7.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:d65d3a08c44b65a44d2c58566aa7e26e85d0d949be71096e09f7ad73d0b5e040
 size 87048408
build/torch25-cxx98-cu118-x86_64-linux/moe/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import
-ops = torch.ops.
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7

 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"
+    return f"_moe_2218ad7::{op_name}"
build/torch25-cxx98-cu118-x86_64-linux/moe/fused_moe.py
CHANGED
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )


+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs. If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
build/torch25-cxx98-cu118-x86_64-linux/moe/layers.py
CHANGED
@@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module):
         _fix_llama4_experts(hidden_states, self.experts)

         router_logits = self.router(hidden_states)
+
+        extra_kwargs = {}
+        use_fp8_w8a8 = False
+        if hasattr(self.experts, "gate_up_proj_scale"):
+            use_fp8_w8a8 = True
+            extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale
+            extra_kwargs["w2_scale"] = self.experts.down_proj_scale
+
         out = fused_moe(
             hidden_states,
             w1=self.experts.gate_up_proj,
@@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module):
             renormalize=False,
             custom_routing_function=_llama4_topk,
             apply_router_weight_on_input=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            **extra_kwargs
         )

         out += self.shared_expert(hidden_states)
build/torch25-cxx98-cu121-x86_64-linux/moe/{_moe_21a4db0.abi3.so → _moe_2218ad7.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:d2d4157287a3e7979780f23a709eba01e787186bc32a5e56c0620b5429e9cfd3
 size 87243240
build/torch25-cxx98-cu121-x86_64-linux/moe/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import
-ops = torch.ops.
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7

 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"
+    return f"_moe_2218ad7::{op_name}"
build/torch25-cxx98-cu121-x86_64-linux/moe/fused_moe.py
CHANGED
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )


+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs. If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
build/torch25-cxx98-cu121-x86_64-linux/moe/layers.py
CHANGED
@@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module):
         _fix_llama4_experts(hidden_states, self.experts)

         router_logits = self.router(hidden_states)
+
+        extra_kwargs = {}
+        use_fp8_w8a8 = False
+        if hasattr(self.experts, "gate_up_proj_scale"):
+            use_fp8_w8a8 = True
+            extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale
+            extra_kwargs["w2_scale"] = self.experts.down_proj_scale
+
         out = fused_moe(
             hidden_states,
             w1=self.experts.gate_up_proj,
@@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module):
             renormalize=False,
             custom_routing_function=_llama4_topk,
             apply_router_weight_on_input=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            **extra_kwargs
         )

         out += self.shared_expert(hidden_states)
build/torch25-cxx98-cu124-x86_64-linux/moe/_moe_21a4db0.abi3.so
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:820b62662956741ae78d7c51fb9fc978ff2e86c7dc1efa1335b0701e0e28749a
-size 86957976
build/{torch25-cxx98-cu118-x86_64-linux/moe/_moe_21a4db0.abi3.so → torch25-cxx98-cu124-x86_64-linux/moe/_moe_2218ad7.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:122544181246b179a772eb07c9e01c8df6b3025c20b333c566d0e84bfd7bea2d
+size 86953880
build/torch25-cxx98-cu124-x86_64-linux/moe/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import
-ops = torch.ops.
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7

 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"
+    return f"_moe_2218ad7::{op_name}"
build/torch25-cxx98-cu124-x86_64-linux/moe/fused_moe.py
CHANGED
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )


+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs. If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
build/torch25-cxx98-cu124-x86_64-linux/moe/layers.py
CHANGED
@@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module):
         _fix_llama4_experts(hidden_states, self.experts)

         router_logits = self.router(hidden_states)
+
+        extra_kwargs = {}
+        use_fp8_w8a8 = False
+        if hasattr(self.experts, "gate_up_proj_scale"):
+            use_fp8_w8a8 = True
+            extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale
+            extra_kwargs["w2_scale"] = self.experts.down_proj_scale
+
         out = fused_moe(
             hidden_states,
             w1=self.experts.gate_up_proj,
@@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module):
             renormalize=False,
             custom_routing_function=_llama4_topk,
             apply_router_weight_on_input=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            **extra_kwargs
         )

         out += self.shared_expert(hidden_states)
build/torch26-cxx11-cu118-x86_64-linux/moe/{_moe_21a4db0.abi3.so → _moe_2218ad7.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:49e17eb28438bddf98e314893cf262b807d64ee03850b46abe4d0bf6151f62b6
 size 87060352
build/torch26-cxx11-cu118-x86_64-linux/moe/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import
-ops = torch.ops.
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7

 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"
+    return f"_moe_2218ad7::{op_name}"
build/torch26-cxx11-cu118-x86_64-linux/moe/fused_moe.py
CHANGED
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )


+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs. If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
build/torch26-cxx11-cu118-x86_64-linux/moe/layers.py
CHANGED
@@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module):
         _fix_llama4_experts(hidden_states, self.experts)

         router_logits = self.router(hidden_states)
+
+        extra_kwargs = {}
+        use_fp8_w8a8 = False
+        if hasattr(self.experts, "gate_up_proj_scale"):
+            use_fp8_w8a8 = True
+            extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale
+            extra_kwargs["w2_scale"] = self.experts.down_proj_scale
+
         out = fused_moe(
             hidden_states,
             w1=self.experts.gate_up_proj,
@@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module):
             renormalize=False,
             custom_routing_function=_llama4_topk,
             apply_router_weight_on_input=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            **extra_kwargs
         )

         out += self.shared_expert(hidden_states)
build/torch26-cxx11-cu124-x86_64-linux/moe/_moe_21a4db0.abi3.so
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:9a9acc9198a56410e1d6bddec3a4529fb14b12843f6589b4477bc4ee795f7278
-size 86961568
build/torch26-cxx11-cu124-x86_64-linux/moe/_moe_2218ad7.abi3.so
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f804164f561c9b46f3b997a6d13552ca4d704c43484b5cd8d14682b4450ed472
+size 86965664
build/torch26-cxx11-cu124-x86_64-linux/moe/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import
-ops = torch.ops.
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7

 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"
+    return f"_moe_2218ad7::{op_name}"
build/torch26-cxx11-cu124-x86_64-linux/moe/fused_moe.py
CHANGED
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )


+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs. If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
build/torch26-cxx11-cu124-x86_64-linux/moe/layers.py
CHANGED
@@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module):
         _fix_llama4_experts(hidden_states, self.experts)

         router_logits = self.router(hidden_states)
+
+        extra_kwargs = {}
+        use_fp8_w8a8 = False
+        if hasattr(self.experts, "gate_up_proj_scale"):
+            use_fp8_w8a8 = True
+            extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale
+            extra_kwargs["w2_scale"] = self.experts.down_proj_scale
+
         out = fused_moe(
             hidden_states,
             w1=self.experts.gate_up_proj,
@@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module):
             renormalize=False,
             custom_routing_function=_llama4_topk,
             apply_router_weight_on_input=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            **extra_kwargs
         )

         out += self.shared_expert(hidden_states)
build/torch26-cxx11-cu126-x86_64-linux/moe/_moe_21a4db0.abi3.so
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:eb26fad3cfe2db1cc88637e020d6d8ddbc54df3e7e8edd64ba9370cd96177587
-size 87428864
build/torch26-cxx11-cu126-x86_64-linux/moe/_moe_2218ad7.abi3.so
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1102bf615b2d2f7c320ac73eed63b982e969683ac72c958080dddb87166fa595
+size 87432960
build/torch26-cxx11-cu126-x86_64-linux/moe/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import
-ops = torch.ops.
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7

 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"
+    return f"_moe_2218ad7::{op_name}"
build/torch26-cxx11-cu126-x86_64-linux/moe/fused_moe.py
CHANGED
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )


+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs. If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
build/torch26-cxx11-cu126-x86_64-linux/moe/layers.py
CHANGED
@@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module):
         _fix_llama4_experts(hidden_states, self.experts)

         router_logits = self.router(hidden_states)
+
+        extra_kwargs = {}
+        use_fp8_w8a8 = False
+        if hasattr(self.experts, "gate_up_proj_scale"):
+            use_fp8_w8a8 = True
+            extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale
+            extra_kwargs["w2_scale"] = self.experts.down_proj_scale
+
         out = fused_moe(
             hidden_states,
             w1=self.experts.gate_up_proj,
@@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module):
             renormalize=False,
             custom_routing_function=_llama4_topk,
             apply_router_weight_on_input=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            **extra_kwargs
         )

         out += self.shared_expert(hidden_states)
build/torch26-cxx98-cu118-x86_64-linux/moe/_moe_2218ad7.abi3.so
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e739bb546d3d1730fa7696fbd767fd588286dec369f1b7551edd1ec481df96f
+size 87044288
build/torch26-cxx98-cu118-x86_64-linux/moe/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import
-ops = torch.ops.
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7

 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"
+    return f"_moe_2218ad7::{op_name}"
build/torch26-cxx98-cu118-x86_64-linux/moe/fused_moe.py
CHANGED
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )


+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs. If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
build/torch26-cxx98-cu118-x86_64-linux/moe/layers.py
CHANGED
@@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module):
         _fix_llama4_experts(hidden_states, self.experts)

         router_logits = self.router(hidden_states)
+
+        extra_kwargs = {}
+        use_fp8_w8a8 = False
+        if hasattr(self.experts, "gate_up_proj_scale"):
+            use_fp8_w8a8 = True
+            extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale
+            extra_kwargs["w2_scale"] = self.experts.down_proj_scale
+
         out = fused_moe(
             hidden_states,
             w1=self.experts.gate_up_proj,
@@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module):
             renormalize=False,
             custom_routing_function=_llama4_topk,
             apply_router_weight_on_input=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            **extra_kwargs
         )

         out += self.shared_expert(hidden_states)
build/torch26-cxx98-cu124-x86_64-linux/moe/{_moe_21a4db0.abi3.so → _moe_2218ad7.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:bcb950d2e7196ad22cad926749b7e0e06e5454f0a732755b72f0b8dd456529c6
 size 86953856
build/torch26-cxx98-cu124-x86_64-linux/moe/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import
-ops = torch.ops.
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7

 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"
+    return f"_moe_2218ad7::{op_name}"
build/torch26-cxx98-cu124-x86_64-linux/moe/fused_moe.py
CHANGED
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )


+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs. If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
build/torch26-cxx98-cu124-x86_64-linux/moe/layers.py
CHANGED
@@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module):
         _fix_llama4_experts(hidden_states, self.experts)

         router_logits = self.router(hidden_states)
+
+        extra_kwargs = {}
+        use_fp8_w8a8 = False
+        if hasattr(self.experts, "gate_up_proj_scale"):
+            use_fp8_w8a8 = True
+            extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale
+            extra_kwargs["w2_scale"] = self.experts.down_proj_scale
+
         out = fused_moe(
             hidden_states,
             w1=self.experts.gate_up_proj,
@@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module):
             renormalize=False,
             custom_routing_function=_llama4_topk,
             apply_router_weight_on_input=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            **extra_kwargs
         )

         out += self.shared_expert(hidden_states)
build/torch26-cxx98-cu126-x86_64-linux/moe/{_moe_21a4db0.abi3.so → _moe_2218ad7.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:fe5c605f1da902aebc1d7ce0355b649fcfcc44aed0023fdc87974f3d56273897
 size 87417064
build/torch26-cxx98-cu126-x86_64-linux/moe/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import
-ops = torch.ops.
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7

 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"
+    return f"_moe_2218ad7::{op_name}"
build/torch26-cxx98-cu126-x86_64-linux/moe/fused_moe.py
CHANGED
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )


+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs. If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,