# Kernel build configuration (TOML) for the "quantization" Torch extension.
# Each [kernel.*] section declares one kernel family: the GPU targets it is
# compiled for (cuda-capabilities / rocm-archs), its source files (src),
# include roots (include), and shared dependencies (depends).
[general]
name = "quantization"

[torch]
src = [
  "core/registration.h",
  "core/scalar_type.hpp",
  "torch-ext/torch_binding.cpp",
  "torch-ext/torch_binding.h"
]
include = [ "." ]

[kernel.cutlass_w8a8]
cuda-capabilities = [ "7.5", "8.0", "8.6", "8.7", "8.9", "9.0", "10.0", "10.1", "12.0" ]
src = [
  "core/math.hpp",
  "cutlass_w8a8/common.hpp",
  "cutlass_w8a8/scaled_mm_c2x.cu",
  "cutlass_w8a8/scaled_mm_c2x.cuh",
  "cutlass_w8a8/scaled_mm_c2x_sm75_dispatch.cuh",
  "cutlass_w8a8/scaled_mm_c2x_sm80_dispatch.cuh",
  "cutlass_w8a8/scaled_mm_c2x_sm89_fp8_dispatch.cuh",
  "cutlass_w8a8/scaled_mm_c2x_sm89_int8_dispatch.cuh",
  "cutlass_w8a8/scaled_mm_entry.cu",
  "cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp",
  "cutlass_extensions/epilogue/broadcast_load_epilogue_c2x.hpp",
]
include = [ "." ]
depends = [ "cutlass_3_6", "torch" ]

[kernel.cutlass_w8a8_hopper]
cuda-capabilities = [ "9.0", "9.0a" ]
src = [
  "core/math.hpp",
  "cutlass_w8a8/common.hpp",
  "cutlass_w8a8/scaled_mm_c3x.cu",
  "cutlass_w8a8/scaled_mm_c3x.cuh",
  "cutlass_w8a8/scaled_mm_c3x_sm90_fp8_dispatch.cuh",
  "cutlass_w8a8/scaled_mm_c3x_sm90_int8_dispatch.cuh",
  "cutlass_extensions/common.cpp",
  "cutlass_extensions/common.hpp",
  "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp",
  "cutlass_extensions/epilogue/broadcast_load_epilogue_c3x.hpp",
]
include = [ "." ]
depends = [ "cutlass_3_6", "torch" ]

[kernel.fp8_common]
# NOTE(review): the hipify language and ROCm arch list below are disabled,
# yet `src` still ships the AMD fp8 headers (fp8/amd/*). Confirm whether
# ROCm support for fp8_common is intentionally off before deleting these.
# language = "cuda-hipify"
# rocm-archs = [ "gfx906", "gfx908", "gfx90a", "gfx940", "gfx941", "gfx942", "gfx1030", "gfx1100", "gfx1101" ]
src = [
  "fp8/amd/hip_float8.h",
  "fp8/amd/hip_float8_impl.h",
  "fp8/common.cu",
  "fp8/common.cuh",
  "dispatch_utils.h",
  "vectorization.cuh"
]
include = [ "." ]
depends = [ "torch" ]

[kernel.fp8_marlin]
cuda-capabilities = [ "8.0", "8.6", "8.7", "8.9", "9.0", "10.0", "10.1", "12.0" ]
src = [
  "fp8/fp8_marlin.cu",
  "gptq_marlin/marlin.cuh",
  "gptq_marlin/marlin_dtypes.cuh",
]
depends = [ "torch" ]

[kernel.int8_common]
cuda-capabilities = [ "7.5", "8.0", "8.6", "8.7", "8.9", "9.0", "10.0", "10.1", "12.0" ]
rocm-archs = [ "gfx906", "gfx908", "gfx90a", "gfx940", "gfx941", "gfx942", "gfx1030", "gfx1100", "gfx1101" ]
src = [
  "compressed_tensors/int8_quant_kernels.cu",
  "dispatch_utils.h"
]
include = [ "." ]
depends = [ "torch" ]

[kernel.gptq_marlin]
cuda-capabilities = [ "8.0", "8.6", "8.7", "8.9", "9.0", "10.0", "10.1", "12.0" ]
src = [
  "core/scalar_type.hpp",
  "gptq_marlin/awq_marlin_repack.cu",
  "gptq_marlin/gptq_marlin.cu",
  "gptq_marlin/gptq_marlin_repack.cu",
  "gptq_marlin/marlin.cuh",
  "gptq_marlin/marlin_dtypes.cuh"
]
include = [ "." ]
depends = [ "torch" ]

[kernel.marlin]
cuda-capabilities = [ "8.0", "8.6", "8.7", "8.9", "9.0", "10.0", "10.1", "12.0" ]
src = [
  "core/scalar_type.hpp",
  "marlin/dense/common/base.h",
  "marlin/dense/common/mem.h",
  "marlin/dense/marlin_cuda_kernel.cu",
  "marlin/qqq/marlin_qqq_gemm_kernel.cu",
  "marlin/sparse/common/base.h",
  "marlin/sparse/common/mem.h",
  "marlin/sparse/common/mma.h",
  "marlin/sparse/marlin_24_cuda_kernel.cu"
]
include = [ "." ]
depends = [ "torch" ]