Update README.md
README.md CHANGED
@@ -218,12 +218,65 @@ chat_completion = client.chat.completions.create(
To run vLLM with Llama 3.3 70B Instruct AWQ in INT4, you will need to have Docker installed (see [installation notes](https://docs.docker.com/engine/install/)) and run the latest vLLM Docker container (a sketch of that command follows the Python example below), or load the model directly through the vLLM Python API as follows:

```python
+import asyncio
+import time
+import logging
+
+from vllm import LLM, SamplingParams
+from transformers import AutoTokenizer
+
+# Keep INFO-level logging for this script, but silence vLLM's verbose engine logs.
+logging.getLogger('vllm').setLevel(logging.ERROR)
+logging.basicConfig(level=logging.INFO)
+
+class vLLMInterfaceAWQ:
+    def initializeawq(self):
+        # Load the AWQ INT4 checkpoint and its tokenizer once, then reuse them for every request.
+        model_id = "vicky4s4s/llama-3.3-70b-instruct-AWQ-INT4"
+        self.sampling_params = SamplingParams(temperature=0.1, max_tokens=8000, seed=3)
+        self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+        self.llm = LLM(model=model_id, trust_remote_code=True, quantization='awq', max_model_len=12000)
+
+    def inferawq(self, inputs):
+        # Apply the model's chat template, generate, and return the text with simple throughput stats.
+        start_time = time.time()
+        chat_format = [{"role": "user", "content": inputs["prompt"]}]
+        text = self.tokenizer.apply_chat_template(chat_format, tokenize=False, add_generation_prompt=True)
+        result = self.llm.generate(text, self.sampling_params)
+        end_time = time.time()
+        total_time = end_time - start_time
+        generated_text = result[0].outputs[0].text
+        num_tokens = len(self.tokenizer(generated_text)['input_ids'])
+        tokens_per_sec = num_tokens / total_time if total_time > 0 else float('inf')
+        return {
+            "time_to_start": f"{start_time:.2f}",  # epoch timestamp at which generation started
+            "tokens_per_sec": f"{tokens_per_sec:.2f} tokens/sec",
+            "total_run_time": f"{total_time:.2f} sec",
+            "generated_text": generated_text,
+        }
+
+model = vLLMInterfaceAWQ()
+model.initializeawq()
+
+async def generate_text_async(prompt, llama_interface):
+    # llm.generate() blocks, so run it in a thread executor to keep the event loop responsive.
+    loop = asyncio.get_running_loop()
+    return await loop.run_in_executor(None, llama_interface.inferawq, {"prompt": prompt})
+
+async def process_queries_async(prompts):
+    tasks = [generate_text_async(prompt, model) for prompt in prompts]
+    results = await asyncio.gather(*tasks)
+    # Pair each prompt with its result instead of returning only the first one.
+    return dict(zip(prompts, results))
+
+answers = asyncio.run(process_queries_async(["what is java?"]))
+print(answers)
```
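For the Docker route mentioned at the top of this section, the command itself is not visible in this hunk. A typical invocation of the official `vllm/vllm-openai` image for this checkpoint might look like the sketch below; the image tag, port, mounted cache path, and flags are assumptions based on standard vLLM Docker usage rather than part of this commit:

```bash
# Sketch: serve the AWQ INT4 checkpoint with vLLM's OpenAI-compatible server on port 8000.
docker run --runtime nvidia --gpus all \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    -p 8000:8000 --ipc=host \
    vllm/vllm-openai:latest \
    --model vicky4s4s/llama-3.3-70b-instruct-AWQ-INT4 \
    --quantization awq \
    --max-model-len 12000
```

`--max-model-len 12000` mirrors the context length used in the Python example above.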

To send a request to the deployed vLLM endpoint, which is compatible with the [OpenAI OpenAPI specification](https://github.com/openai/openai-openapi) (i.e. `/v1/chat/completions`):
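For instance, with `curl` (a minimal sketch: the payload is illustrative, and the `model` value must match the model the server was started with):

```bash
curl http://localhost:8000/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "vicky4s4s/llama-3.3-70b-instruct-AWQ-INT4",
        "messages": [{"role": "user", "content": "what is java?"}],
        "temperature": 0.1,
        "max_tokens": 512
    }'
```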