Mohamed Mekkouri committed
Commit 71671c1 · Parent: 4db85a7

add builds

build/torch28-metal-aarch64-darwin/gpt_oss_metal_kernels/__init__.py ADDED
@@ -0,0 +1,174 @@
+ from ._ops import ops
+ import torch
+
+ def f32_bf16w_matmul(input: torch.Tensor,
+                      weight_bf16: torch.Tensor,
+                      bias_bf16: torch.Tensor,
+                      output: torch.Tensor,
+                      num_tokens: int,
+                      num_cols: int,
+                      num_rows: int,
+                      threadgroup_size: int) -> torch.Tensor:
+     ops.f32_bf16w_matmul(input, weight_bf16, bias_bf16, output,
+                          num_tokens, num_cols, num_rows, threadgroup_size)
+     return output
+
+ def bf16_f32_embeddings(token_ids: torch.Tensor,
+                         weight_bf16: torch.Tensor,
+                         output: torch.Tensor,
+                         threadgroup_size: int) -> torch.Tensor:
+     ops.bf16_f32_embeddings(token_ids, weight_bf16, output, threadgroup_size)
+     return output
+
+ def f32_bf16w_rmsnorm(input: torch.Tensor,
+                       weight_bf16: torch.Tensor,
+                       output: torch.Tensor,
+                       epsilon: float) -> torch.Tensor:
+     ops.f32_bf16w_rmsnorm(input, weight_bf16, output, epsilon)
+     return output
+
+ def f32_bf16w_dense_matmul_qkv(input: torch.Tensor,
+                                weight_bf16: torch.Tensor,
+                                bias_bf16: torch.Tensor,
+                                output: torch.Tensor) -> torch.Tensor:
+     ops.f32_bf16w_dense_matmul_qkv(input, weight_bf16, bias_bf16, output)
+     return output
+
+ def f32_bf16w_dense_matmul_attn_output(input: torch.Tensor,
+                                        weight_bf16: torch.Tensor,
+                                        bias_bf16: torch.Tensor,
+                                        output: torch.Tensor) -> torch.Tensor:
+     ops.f32_bf16w_dense_matmul_attn_output(input, weight_bf16, bias_bf16, output)
+     return output
+
+ def f32_bf16w_dense_matmul_mlp_gate(input: torch.Tensor,
+                                     weight_bf16: torch.Tensor,
+                                     bias_bf16: torch.Tensor,
+                                     output: torch.Tensor) -> torch.Tensor:
+     ops.f32_bf16w_dense_matmul_mlp_gate(input, weight_bf16, bias_bf16, output)
+     return output
+
+ def f32_rope(activations: torch.Tensor,
+              rope_base: float,
+              interpolation_scale: float,
+              yarn_offset: float,
+              yarn_scale: float,
+              yarn_multiplier: float,
+              num_tokens: int,
+              num_q_heads: int,
+              num_kv_heads: int,
+              attn_head_dim: int,
+              token_offset: int,
+              threadgroup_size: int) -> torch.Tensor:
+     ops.f32_rope(activations, rope_base, interpolation_scale, yarn_offset,
+                  yarn_scale, yarn_multiplier, num_tokens, num_q_heads,
+                  num_kv_heads, attn_head_dim, token_offset, threadgroup_size)
+     return activations
+
+ def f32_bf16w_matmul_qkv(input: torch.Tensor,
+                          weight_bf16: torch.Tensor,
+                          bias_bf16: torch.Tensor,
+                          output: torch.Tensor,
+                          kv_cache: torch.Tensor,
+                          kv_cache_offset_bytes: int,
+                          num_tokens: int,
+                          num_cols: int,
+                          num_q_heads: int,
+                          num_kv_heads: int,
+                          attn_head_dim: int,
+                          token_offset: int,
+                          max_tokens: int,
+                          rope_base: float,
+                          interpolation_scale: float,
+                          yarn_offset: float,
+                          yarn_scale: float,
+                          yarn_multiplier: float,
+                          threadgroup_size: int) -> torch.Tensor:
+     ops.f32_bf16w_matmul_qkv(input, weight_bf16, bias_bf16, output, kv_cache,
+                              kv_cache_offset_bytes, num_tokens, num_cols,
+                              num_q_heads, num_kv_heads, attn_head_dim,
+                              token_offset, max_tokens, rope_base,
+                              interpolation_scale, yarn_offset, yarn_scale,
+                              yarn_multiplier, threadgroup_size)
+     return output
+
+ def f32_sdpa(q: torch.Tensor,
+              q_offset_bytes: int,
+              kv: torch.Tensor,
+              kv_offset_bytes: int,
+              s_bf16: torch.Tensor,
+              s_offset_bytes: int,
+              output: torch.Tensor,
+              output_offset_bytes: int,
+              window: int,
+              kv_stride: int,
+              num_q_tokens: int,
+              num_kv_tokens: int,
+              num_q_heads: int,
+              num_kv_heads: int,
+              head_dim: int) -> torch.Tensor:
+     ops.f32_sdpa(q, q_offset_bytes, kv, kv_offset_bytes, s_bf16, s_offset_bytes,
+                  output, output_offset_bytes, window, kv_stride,
+                  num_q_tokens, num_kv_tokens, num_q_heads, num_kv_heads, head_dim)
+     return output
+
+ def f32_topk(scores: torch.Tensor,
+              expert_ids: torch.Tensor,
+              expert_scores: torch.Tensor,
+              num_tokens: int,
+              num_experts: int,
+              num_active_experts: int) -> None:
+     ops.f32_topk(scores, expert_ids, expert_scores,
+                  num_tokens, num_experts, num_active_experts)
+
+ def expert_routing_metadata(expert_ids: torch.Tensor,
+                             expert_scores: torch.Tensor,
+                             expert_offsets: torch.Tensor,
+                             intra_expert_offsets: torch.Tensor,
+                             num_tokens: int,
+                             num_experts: int) -> None:
+     ops.expert_routing_metadata(expert_ids, expert_scores,
+                                 expert_offsets, intra_expert_offsets,
+                                 num_tokens, num_experts)
+
+ def f32_scatter(input: torch.Tensor,
+                 expert_ids: torch.Tensor,
+                 expert_scores: torch.Tensor,
+                 expert_offsets: torch.Tensor,
+                 intra_expert_offsets: torch.Tensor,
+                 output: torch.Tensor,
+                 num_channels: int,
+                 num_tokens: int,
+                 num_active_experts: int) -> torch.Tensor:
+     ops.f32_scatter(input, expert_ids, expert_scores,
+                     expert_offsets, intra_expert_offsets,
+                     output, num_channels, num_tokens, num_active_experts)
+     return output
+
+ def f32_bf16w_matmul_add(input: torch.Tensor,
+                          weight_bf16: torch.Tensor,
+                          bias_bf16: torch.Tensor,
+                          output: torch.Tensor,
+                          num_tokens: int,
+                          num_cols: int,
+                          num_rows: int,
+                          threadgroup_size: int) -> torch.Tensor:
+     ops.f32_bf16w_matmul_add(input, weight_bf16, bias_bf16, output,
+                              num_tokens, num_cols, num_rows, threadgroup_size)
+     return output
+
+ __all__ = [
+     "f32_bf16w_matmul",
+     "bf16_f32_embeddings",
+     "f32_bf16w_rmsnorm",
+     "f32_bf16w_dense_matmul_qkv",
+     "f32_bf16w_dense_matmul_attn_output",
+     "f32_bf16w_dense_matmul_mlp_gate",
+     "f32_rope",
+     "f32_bf16w_matmul_qkv",
+     "f32_sdpa",
+     "f32_topk",
+     "expert_routing_metadata",
+     "f32_scatter",
+     "f32_bf16w_matmul_add",
+ ]
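
Not part of the commit itself: a minimal usage sketch of the wrapper API above. It assumes an Apple-silicon machine with the MPS backend, fp32 activations of shape [num_tokens, hidden_dim], a bf16 RMSNorm weight vector of shape [hidden_dim], and a caller-preallocated output buffer; the shapes are illustrative assumptions, only the function signature comes from the file.

import torch
import gpt_oss_metal_kernels as mk

num_tokens, hidden_dim = 4, 2880   # illustrative shapes, not taken from the commit
x = torch.randn(num_tokens, hidden_dim, device="mps", dtype=torch.float32)
w = torch.randn(hidden_dim, device="mps", dtype=torch.bfloat16)
y = torch.empty_like(x)            # the kernels write into a preallocated output

# The wrapper forwards to the Metal kernel and returns the same output tensor.
y = mk.f32_bf16w_rmsnorm(x, w, y, 1e-5)
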
build/torch28-metal-aarch64-darwin/gpt_oss_metal_kernels/_gpt_oss_metal_kernels_4db85a7.abi3.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d9c015309337936241252a9bd6db397f272ba77f05cc81294c6868e29fc5502a
+ size 425800
build/torch28-metal-aarch64-darwin/gpt_oss_metal_kernels/_ops.py ADDED
@@ -0,0 +1,9 @@
+ import torch
+ from . import _gpt_oss_metal_kernels_4db85a7
+ ops = torch.ops._gpt_oss_metal_kernels_4db85a7
+
+ def add_op_namespace_prefix(op_name: str):
+     """
+     Prefix op by namespace.
+     """
+     return f"_gpt_oss_metal_kernels_4db85a7::{op_name}"
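
A small sketch (not from the commit) of what the `_ops.py` helper produces: the op name qualified with the versioned namespace that the shared library registers and that `ops` is bound to above.

from gpt_oss_metal_kernels._ops import ops, add_op_namespace_prefix

name = add_op_namespace_prefix("f32_bf16w_rmsnorm")
# name == "_gpt_oss_metal_kernels_4db85a7::f32_bf16w_rmsnorm"

# The same op is reachable as an attribute of the bound torch.ops namespace.
op = getattr(ops, "f32_bf16w_rmsnorm")
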
build/torch29-metal-aarch64-darwin/gpt_oss_metal_kernels/__init__.py ADDED
@@ -0,0 +1,174 @@
+ from ._ops import ops
+ import torch
+
+ def f32_bf16w_matmul(input: torch.Tensor,
+                      weight_bf16: torch.Tensor,
+                      bias_bf16: torch.Tensor,
+                      output: torch.Tensor,
+                      num_tokens: int,
+                      num_cols: int,
+                      num_rows: int,
+                      threadgroup_size: int) -> torch.Tensor:
+     ops.f32_bf16w_matmul(input, weight_bf16, bias_bf16, output,
+                          num_tokens, num_cols, num_rows, threadgroup_size)
+     return output
+
+ def bf16_f32_embeddings(token_ids: torch.Tensor,
+                         weight_bf16: torch.Tensor,
+                         output: torch.Tensor,
+                         threadgroup_size: int) -> torch.Tensor:
+     ops.bf16_f32_embeddings(token_ids, weight_bf16, output, threadgroup_size)
+     return output
+
+ def f32_bf16w_rmsnorm(input: torch.Tensor,
+                       weight_bf16: torch.Tensor,
+                       output: torch.Tensor,
+                       epsilon: float) -> torch.Tensor:
+     ops.f32_bf16w_rmsnorm(input, weight_bf16, output, epsilon)
+     return output
+
+ def f32_bf16w_dense_matmul_qkv(input: torch.Tensor,
+                                weight_bf16: torch.Tensor,
+                                bias_bf16: torch.Tensor,
+                                output: torch.Tensor) -> torch.Tensor:
+     ops.f32_bf16w_dense_matmul_qkv(input, weight_bf16, bias_bf16, output)
+     return output
+
+ def f32_bf16w_dense_matmul_attn_output(input: torch.Tensor,
+                                        weight_bf16: torch.Tensor,
+                                        bias_bf16: torch.Tensor,
+                                        output: torch.Tensor) -> torch.Tensor:
+     ops.f32_bf16w_dense_matmul_attn_output(input, weight_bf16, bias_bf16, output)
+     return output
+
+ def f32_bf16w_dense_matmul_mlp_gate(input: torch.Tensor,
+                                     weight_bf16: torch.Tensor,
+                                     bias_bf16: torch.Tensor,
+                                     output: torch.Tensor) -> torch.Tensor:
+     ops.f32_bf16w_dense_matmul_mlp_gate(input, weight_bf16, bias_bf16, output)
+     return output
+
+ def f32_rope(activations: torch.Tensor,
+              rope_base: float,
+              interpolation_scale: float,
+              yarn_offset: float,
+              yarn_scale: float,
+              yarn_multiplier: float,
+              num_tokens: int,
+              num_q_heads: int,
+              num_kv_heads: int,
+              attn_head_dim: int,
+              token_offset: int,
+              threadgroup_size: int) -> torch.Tensor:
+     ops.f32_rope(activations, rope_base, interpolation_scale, yarn_offset,
+                  yarn_scale, yarn_multiplier, num_tokens, num_q_heads,
+                  num_kv_heads, attn_head_dim, token_offset, threadgroup_size)
+     return activations
+
+ def f32_bf16w_matmul_qkv(input: torch.Tensor,
+                          weight_bf16: torch.Tensor,
+                          bias_bf16: torch.Tensor,
+                          output: torch.Tensor,
+                          kv_cache: torch.Tensor,
+                          kv_cache_offset_bytes: int,
+                          num_tokens: int,
+                          num_cols: int,
+                          num_q_heads: int,
+                          num_kv_heads: int,
+                          attn_head_dim: int,
+                          token_offset: int,
+                          max_tokens: int,
+                          rope_base: float,
+                          interpolation_scale: float,
+                          yarn_offset: float,
+                          yarn_scale: float,
+                          yarn_multiplier: float,
+                          threadgroup_size: int) -> torch.Tensor:
+     ops.f32_bf16w_matmul_qkv(input, weight_bf16, bias_bf16, output, kv_cache,
+                              kv_cache_offset_bytes, num_tokens, num_cols,
+                              num_q_heads, num_kv_heads, attn_head_dim,
+                              token_offset, max_tokens, rope_base,
+                              interpolation_scale, yarn_offset, yarn_scale,
+                              yarn_multiplier, threadgroup_size)
+     return output
+
+ def f32_sdpa(q: torch.Tensor,
+              q_offset_bytes: int,
+              kv: torch.Tensor,
+              kv_offset_bytes: int,
+              s_bf16: torch.Tensor,
+              s_offset_bytes: int,
+              output: torch.Tensor,
+              output_offset_bytes: int,
+              window: int,
+              kv_stride: int,
+              num_q_tokens: int,
+              num_kv_tokens: int,
+              num_q_heads: int,
+              num_kv_heads: int,
+              head_dim: int) -> torch.Tensor:
+     ops.f32_sdpa(q, q_offset_bytes, kv, kv_offset_bytes, s_bf16, s_offset_bytes,
+                  output, output_offset_bytes, window, kv_stride,
+                  num_q_tokens, num_kv_tokens, num_q_heads, num_kv_heads, head_dim)
+     return output
+
+ def f32_topk(scores: torch.Tensor,
+              expert_ids: torch.Tensor,
+              expert_scores: torch.Tensor,
+              num_tokens: int,
+              num_experts: int,
+              num_active_experts: int) -> None:
+     ops.f32_topk(scores, expert_ids, expert_scores,
+                  num_tokens, num_experts, num_active_experts)
+
+ def expert_routing_metadata(expert_ids: torch.Tensor,
+                             expert_scores: torch.Tensor,
+                             expert_offsets: torch.Tensor,
+                             intra_expert_offsets: torch.Tensor,
+                             num_tokens: int,
+                             num_experts: int) -> None:
+     ops.expert_routing_metadata(expert_ids, expert_scores,
+                                 expert_offsets, intra_expert_offsets,
+                                 num_tokens, num_experts)
+
+ def f32_scatter(input: torch.Tensor,
+                 expert_ids: torch.Tensor,
+                 expert_scores: torch.Tensor,
+                 expert_offsets: torch.Tensor,
+                 intra_expert_offsets: torch.Tensor,
+                 output: torch.Tensor,
+                 num_channels: int,
+                 num_tokens: int,
+                 num_active_experts: int) -> torch.Tensor:
+     ops.f32_scatter(input, expert_ids, expert_scores,
+                     expert_offsets, intra_expert_offsets,
+                     output, num_channels, num_tokens, num_active_experts)
+     return output
+
+ def f32_bf16w_matmul_add(input: torch.Tensor,
+                          weight_bf16: torch.Tensor,
+                          bias_bf16: torch.Tensor,
+                          output: torch.Tensor,
+                          num_tokens: int,
+                          num_cols: int,
+                          num_rows: int,
+                          threadgroup_size: int) -> torch.Tensor:
+     ops.f32_bf16w_matmul_add(input, weight_bf16, bias_bf16, output,
+                              num_tokens, num_cols, num_rows, threadgroup_size)
+     return output
+
+ __all__ = [
+     "f32_bf16w_matmul",
+     "bf16_f32_embeddings",
+     "f32_bf16w_rmsnorm",
+     "f32_bf16w_dense_matmul_qkv",
+     "f32_bf16w_dense_matmul_attn_output",
+     "f32_bf16w_dense_matmul_mlp_gate",
+     "f32_rope",
+     "f32_bf16w_matmul_qkv",
+     "f32_sdpa",
+     "f32_topk",
+     "expert_routing_metadata",
+     "f32_scatter",
+     "f32_bf16w_matmul_add",
+ ]
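
For the MoE path, a hypothetical routing sketch (not part of the commit): buffer shapes, dtypes, and the offset layout are assumptions, and only the wrapper signatures come from the file above.

import torch
import gpt_oss_metal_kernels as mk

num_tokens, num_experts, num_active = 4, 32, 4
scores = torch.randn(num_tokens, num_experts, device="mps", dtype=torch.float32)

# Preallocated routing buffers; int32 ids and this offsets layout are assumptions.
expert_ids = torch.empty(num_tokens, num_active, device="mps", dtype=torch.int32)
expert_scores = torch.empty(num_tokens, num_active, device="mps", dtype=torch.float32)
expert_offsets = torch.empty(num_experts + 1, device="mps", dtype=torch.int32)
intra_expert_offsets = torch.empty(num_tokens, num_active, device="mps", dtype=torch.int32)

# Select the top-k experts per token, then build per-expert offsets for scatter/gather.
mk.f32_topk(scores, expert_ids, expert_scores, num_tokens, num_experts, num_active)
mk.expert_routing_metadata(expert_ids, expert_scores, expert_offsets,
                           intra_expert_offsets, num_tokens, num_experts)
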
build/torch29-metal-aarch64-darwin/gpt_oss_metal_kernels/_gpt_oss_metal_kernels_4db85a7.abi3.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:51bd6eb3c95b6602dd1ddff1137b8fab4db2cb9781c08f96cf12147f7f1c9fa0
+ size 410328
build/torch29-metal-aarch64-darwin/gpt_oss_metal_kernels/_ops.py ADDED
@@ -0,0 +1,9 @@
+ import torch
+ from . import _gpt_oss_metal_kernels_4db85a7
+ ops = torch.ops._gpt_oss_metal_kernels_4db85a7
+
+ def add_op_namespace_prefix(op_name: str):
+     """
+     Prefix op by namespace.
+     """
+     return f"_gpt_oss_metal_kernels_4db85a7::{op_name}"