mobicham committed
Commit c720f90 · verified · 1 Parent(s): f94fcda

Update README.md

Files changed (1):
  1. README.md +49 -0

README.md CHANGED
@@ -74,4 +74,53 @@ To solve the addition \(1.5 + 102.2\), follow these steps:
  \]

  So, the final answer is \(\boxed{103.7}\).<|end▁of▁sentence|>
+ ```
+
+ ## HQQ
+ Run ~3.5x faster with <a href="https://github.com/mobiusml/hqq/">HQQ</a>. First, install the dependencies:
+ ```
+ pip install hqq
+ ```
+
+ ```Python
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ from hqq.models.hf.base import AutoHQQHFModel
+ from hqq.core.quantize import BaseQuantizeConfig
+
+ # Params
+ device = 'cuda:0'
+ backend = "torchao_int4"  # the int4 kernels use bfloat16 compute
+ compute_dtype = torch.bfloat16 if backend == "torchao_int4" else torch.float16
+ model_id = "mobiuslabsgmbh/DeepSeek-R1-ReDistill-Qwen-7B-v1.1"
+
+ # Load the tokenizer and the full-precision model
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
+ model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=compute_dtype, attn_implementation="sdpa")
+
+ # Quantize to 4-bit, group-wise along axis=1
+ quant_config = BaseQuantizeConfig(nbits=4, group_size=64, axis=1)
+ AutoHQQHFModel.quantize_model(model, quant_config=quant_config, compute_dtype=compute_dtype, device=device)
+
+ # Optimize: patch the quantized layers with the faster backend kernels
+ from hqq.utils.patching import prepare_for_inference
+ prepare_for_inference(model, backend=backend, verbose=False)
+
+ ############################################################
+ # Generate (streaming)
+ from hqq.utils.generation_hf import HFGenerator
+ gen = HFGenerator(model, tokenizer, max_new_tokens=8192, do_sample=True, compile='partial').warmup()
+
+ prompt = "If A equals B, and C equals B - A, what would be the value of C?"
+ out = gen.generate(prompt, print_tokens=True)
+
+ ############################################################
+ # # Generate (simple)
+ # from hqq.utils.generation_hf import patch_model_for_compiled_runtime
+ # patch_model_for_compiled_runtime(model, tokenizer, warmup=True)
+
+ # prompt = "If A equals B, and C equals B - A, what would be the value of C?"
+ # chat = tokenizer.apply_chat_template([{"role":"user", "content":prompt}], tokenize=True, add_generation_prompt=True, return_tensors="pt")
+ # outputs = model.generate(chat.to(device), max_new_tokens=8192, do_sample=True)
+ # print(tokenizer.decode(outputs[0]))
  ```
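
Since the quantization step above runs at load time, it can be worth persisting the quantized weights once and reloading them afterwards. Below is a minimal sketch of that, assuming hqq's `AutoHQQHFModel.save_quantized`/`from_quantized` helpers as described in the hqq repository; the `save_dir` path is illustrative, not an official artifact.
```Python
import torch
from transformers import AutoTokenizer
from hqq.models.hf.base import AutoHQQHFModel

# Illustrative local path, not an official checkpoint
save_dir = "DeepSeek-R1-ReDistill-Qwen-7B-hqq"

# Continuing from the snippet above, after quantize_model(...):
# persist the quantized weights and the tokenizer
AutoHQQHFModel.save_quantized(model, save_dir)
tokenizer.save_pretrained(save_dir)

# In a fresh process: reload without re-quantizing
tokenizer = AutoTokenizer.from_pretrained(save_dir)
model = AutoHQQHFModel.from_quantized(save_dir, compute_dtype=torch.bfloat16, device='cuda:0')
```
After reloading, `prepare_for_inference` and the generator setup from the snippet above would be applied in the same way.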