Add torchao int4 weight only quantization as an option (#34)
- Add torchao int4 weight only quantization as an option (3795233d4c81c81bca28df592f4c303f1dbaeee6)
- fix(quant): add torchao to quantization option (371c49c8bf1df0569820dcd4aee19441247c159d)
Co-authored-by: Jerry Zhang <[email protected]>
- hardware.yaml +3 -1
- src/kernels.py +8 -1
- src/panel.py +1 -1
- src/utils.py +5 -0
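For context, and not part of this diff: "torchao int4 weight only" corresponds to torchao's int4_weight_only quantization type, which is the quant_type string the new src/utils.py branch matches on. Below is a minimal sketch of loading a model with it through transformers' TorchAoConfig; the model id and group_size are illustrative assumptions, not values taken from this change.

# Sketch only (not from this commit): load a model with torchao int4
# weight-only quantization via transformers' TorchAoConfig.
# The model id and group_size are illustrative assumptions.
from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig

model_id = "meta-llama/Llama-2-7b-hf"  # hypothetical example model
quant_config = TorchAoConfig(quant_type="int4_weight_only", group_size=128)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=quant_config,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)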
hardware.yaml CHANGED
@@ -19,6 +19,7 @@
     - awq
     - bnb
     - gptq
+    - torchao
   backends:
     - pytorch
 
@@ -31,6 +32,7 @@
     - awq
     - bnb
     - gptq
+    - torchao
   backends:
     - pytorch
 
@@ -45,4 +47,4 @@
   backends:
     - pytorch
     - openvino
-    - onnxruntime
+    - onnxruntime
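A small sketch of how these hardware.yaml entries might be consumed, assuming the file is a top-level list of machine entries; the "machine" and "quantizations" key names are assumptions here, since only the list items and the "backends" key appear in the hunks above.

# Sketch under assumptions: hardware.yaml as a list of machine entries,
# each carrying a quantization list that now includes "torchao".
import yaml

with open("hardware.yaml") as f:
    hardware_configs = yaml.safe_load(f)

for entry in hardware_configs:
    # e.g. ['awq', 'bnb', 'gptq', 'torchao'] for the NVIDIA entries
    print(entry.get("machine"), entry.get("quantizations"), entry.get("backends"))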
src/kernels.py CHANGED
@@ -38,6 +38,7 @@ def get_quant_df(llm_perf_df):
     exllamav2_df = copy_df[(copy_df["Quantization 🗜️"] == "GPTQ.4bit+ExllamaV2")]
     gemm_df = copy_df[(copy_df["Quantization 🗜️"] == "AWQ.4bit+GEMM")]
     gemv_df = copy_df[(copy_df["Quantization 🗜️"] == "AWQ.4bit+GEMV")]
+    torchao_df = copy_df[(copy_df["Quantization 🗜️"] == "torchao.4bit")]
     # merge the three dataframes
     exllamav1_df = pd.merge(
         vanilla_df,
@@ -63,8 +64,14 @@ def get_quant_df(llm_perf_df):
         on=["Model 🤗"],
         suffixes=["", " Custom Kernel"],
     )
+    torchao_df = pd.merge(
+        vanilla_df,
+        torchao_df,
+        on=["Model 🤗"],
+        suffixes=["", " Custom Kernel"],
+    )
     # concat the two dataframes row-wise
-    quant_df = pd.concat([exllamav1_df, exllamav2_df, gemm_df, gemv_df])
+    quant_df = pd.concat([exllamav1_df, exllamav2_df, gemm_df, gemv_df, torchao_df])
     # compute speedups
     quant_df["Prefill Speedup (%)"] = (
         (quant_df["Prefill (s)"] / quant_df["Prefill (s) Custom Kernel"]) * 100
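A toy, self-contained illustration of the merge/concat pattern used in get_quant_df above: merging the unquantized rows with the new torchao.4bit rows on "Model 🤗" with suffixes=["", " Custom Kernel"] produces the "Prefill (s) Custom Kernel" column that the speedup formula divides by. The model name and timings are made up.

# Toy data only; illustrates the suffixes/concat/speedup pattern above.
import pandas as pd

vanilla_df = pd.DataFrame(
    {"Model 🤗": ["llama-7b"], "Quantization 🗜️": ["Unquantized"], "Prefill (s)": [1.0]}
)
torchao_df = pd.DataFrame(
    {"Model 🤗": ["llama-7b"], "Quantization 🗜️": ["torchao.4bit"], "Prefill (s)": [0.5]}
)

torchao_df = pd.merge(
    vanilla_df,
    torchao_df,
    on=["Model 🤗"],
    suffixes=["", " Custom Kernel"],
)
# the real function also concats the ExllamaV1/V2 and AWQ GEMM/GEMV frames here
quant_df = pd.concat([torchao_df])
quant_df["Prefill Speedup (%)"] = (
    quant_df["Prefill (s)"] / quant_df["Prefill (s) Custom Kernel"]
) * 100
print(quant_df["Prefill Speedup (%)"].iloc[0])  # 200.0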
src/panel.py CHANGED
@@ -26,7 +26,7 @@ def create_control_panel(
     if hardware_provider == "nvidia":
         backends = ["pytorch"]
         attention_implementations = ["Eager", "SDPA", "FAv2"]
-        quantizations = ["Unquantized", "BnB.4bit", "BnB.8bit", "AWQ.4bit", "GPTQ.4bit"]
+        quantizations = ["Unquantized", "BnB.4bit", "BnB.8bit", "AWQ.4bit", "GPTQ.4bit", "torchao.4bit"]
         kernels = [
             "No Kernel",
             "GPTQ.ExllamaV1",
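For illustration, a sketch of how the extended quantizations list could back the control panel widget; this assumes the panel is built with Gradio (common for Spaces) and that the quantization filter is a checkbox group, neither of which is shown in this diff. The label text is also illustrative.

# Assumptions: Gradio UI, checkbox-group widget, illustrative label text.
import gradio as gr

quantizations = ["Unquantized", "BnB.4bit", "BnB.8bit", "AWQ.4bit", "GPTQ.4bit", "torchao.4bit"]

with gr.Blocks() as demo:
    quantization_filter = gr.CheckboxGroup(
        choices=quantizations,
        value=quantizations,  # everything selected by default
        label="Quantizations 🗜️",
    )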
src/utils.py CHANGED
@@ -70,6 +70,11 @@ def process_quantizations(x):
         and x["config.backend.quantization_config.bits"] == 4
     ):
         return "AWQ.4bit"
+    elif (
+        x["config.backend.quantization_scheme"] == "torchao"
+        and x["config.backend.quantization_config.quant_type"] == "int4_weight_only"
+    ):
+        return "torchao.4bit"
     else:
         return "Unquantized"
 
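A standalone illustration of the new branch only (the other branches of process_quantizations are not shown in this diff); a plain dict stands in for the row x, which in the real code is presumably a row of the flattened benchmark config columns.

# Only the new torchao condition, exercised on a made-up row.
row = {
    "config.backend.quantization_scheme": "torchao",
    "config.backend.quantization_config.quant_type": "int4_weight_only",
}

if (
    row["config.backend.quantization_scheme"] == "torchao"
    and row["config.backend.quantization_config.quant_type"] == "int4_weight_only"
):
    label = "torchao.4bit"
else:
    label = "Unquantized"

print(label)  # torchao.4bit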