Update inference/kernel.py
inference/kernel.py   +0 -78   CHANGED
@@ -194,81 +194,3 @@ def fp8_gemm(
     kernel = fp8_gemm_kernel(N, K)
     kernel(a.view(M, K), b, c.view(M, N), a_s.view(M, -1), b_s)
     return c
-
-
-@tilelang.jit(out_idx=[4], pass_configs=pass_configs)
-def fp8_index_kernel(h: int, d: int):
-    b = T.symbolic("b")
-    m = T.symbolic("m")
-    n = T.symbolic("n")
-
-    blk_n1 = 512
-    blk_n2 = 128
-
-    @T.prim_func
-    def fp8_index_kernel_(
-        q: T.Tensor[(b, m, h, d), FP8],
-        q_s: T.Tensor[(b, m, h), FP32],
-        k: T.Tensor[(b, n, d), FP8],
-        k_s: T.Tensor[(b, n), FP32],
-        o: T.Tensor[(b, m, n), FP32],
-    ) -> None:
-        with T.Kernel(b, m, T.ceildiv(n, blk_n1)) as (i_b, i_m, i1_n):
-            q_smem = T.alloc_shared((h, d), FP8)
-            T.copy(q[i_b, i_m, 0, 0], q_smem)
-
-            q_s_frag = T.alloc_fragment(h, FP32)
-            T.copy(q_s[i_b, i_m, 0], q_s_frag)
-
-            for i2_n in T.Pipelined(blk_n1 // blk_n2, num_stages=2):
-                k_smem = T.alloc_shared((blk_n2, d), FP8)
-                T.copy(k[i_b, i1_n * blk_n1 + i2_n * blk_n2, 0], k_smem)
-
-                k_s_frag = T.alloc_fragment(blk_n2, FP32)
-                T.copy(k_s[i_b, i1_n * blk_n1 + i2_n * blk_n2], k_s_frag)
-
-                logits = T.alloc_fragment((blk_n2, h), FP32)
-                T.gemm(
-                    k_smem,
-                    q_smem,
-                    logits,
-                    transpose_A=False,
-                    transpose_B=True,
-                    clear_accum=True,
-                )
-
-                for i_h, i3_n in T.Parallel(h, blk_n2):
-                    logits[i3_n, i_h] = T.max(logits[i3_n, i_h], 0) * q_s_frag[i_h]
-
-                logits_sum = T.alloc_fragment(blk_n2, FP32)
-                T.reduce_sum(logits, logits_sum, dim=1)
-
-                for i3_n in T.Parallel(blk_n2):
-                    logits_sum[i3_n] *= k_s_frag[i3_n]
-
-                T.copy(logits_sum, o[i_b, i_m, i1_n * blk_n1 + i2_n * blk_n2])
-
-    return fp8_index_kernel_
-
-
-def fp8_index(
-    q: torch.Tensor,
-    q_s: torch.Tensor,
-    k: torch.Tensor,
-    k_s: torch.Tensor,
-) -> torch.Tensor:
-    """
-    Perform index score using FP8 precision.
-
-    Args:
-        q (torch.Tensor): The Q tensor, must be contiguous.
-        q_s (torch.Tensor): The scaling factor for Q (float), must be contiguous.
-        k (torch.Tensor): The K tensor, must be contiguous.
-        k_s (torch.Tensor): The scaling factor for K (e8m0 here), must be contiguous.
-
-    fp8 q @ fp8 k -> fp32 logits
-    relu(fp32 logits) * q_s (weights) -> fp32 logits
-    fp32 logits -> fp32 logits_sum
-    fp32 logits_sum * k_s (e8m0) -> fp32 index_score
-    """
-    return fp8_index_kernel(q.shape[2], q.shape[3])(q, q_s, k, k_s)
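
For reference, the removed fp8_index computed what its docstring calls the index score: relu(q @ k^T), weighted per head by q_s, reduced over the head dimension, then scaled per key token by k_s. A minimal dense-PyTorch sketch of the same math (an illustration only: it ignores the FP8 quantization and the shared-memory tiling; the function name index_score_reference is hypothetical, while shapes follow the deleted kernel's signature):

import torch

def index_score_reference(q, q_s, k, k_s):
    # Hypothetical plain-float reference for the deleted kernel.
    # q:   (b, m, h, d)  queries (FP8 in the kernel)
    # q_s: (b, m, h)     per-head query scales / weights (FP32)
    # k:   (b, n, d)     keys (FP8 in the kernel)
    # k_s: (b, n)        per-token key scales (e8m0 in the kernel)
    # fp8 q @ fp8 k -> fp32 logits, shape (b, m, h, n)
    logits = torch.einsum("bmhd,bnd->bmhn", q.float(), k.float())
    # relu(logits) * q_s, broadcast over the n dimension
    logits = torch.relu(logits) * q_s[..., None]
    # reduce over heads h -> (b, m, n)
    logits_sum = logits.sum(dim=2)
    # scale by k_s -> fp32 index_score, shape (b, m, n)
    return logits_sum * k_s[:, None, :]

The tiled kernel produced the same (b, m, n) output blockwise: each (i_b, i_m, i1_n) program kept one (h, d) query tile in shared memory and streamed K through it in blk_n2 = 128-row chunks under a two-stage pipeline, writing each chunk's reduced scores straight to o.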
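
A quick shape check of the sketch, with arbitrary illustrative sizes (the real fp8_index additionally required contiguous FP8 inputs):

b, m, n, h, d = 1, 4, 1024, 8, 64          # hypothetical sizes
q, k = torch.randn(b, m, h, d), torch.randn(b, n, d)
q_s, k_s = torch.rand(b, m, h), torch.rand(b, n)
assert index_score_reference(q, q_s, k, k_s).shape == (b, m, n)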