vicky4s4s committed on
Commit 7dd9f94 · verified · 1 Parent(s): ff16f3d

Update README.md

Files changed (1)
  1. README.md +59 -6
README.md CHANGED
@@ -218,12 +218,65 @@ chat_completion = client.chat.completions.create(
  To run vLLM with Llama 3.3 70B Instruct AWQ in INT4, you will need to have Docker installed (see [installation notes](https://docs.docker.com/engine/install/)) and run the latest vLLM Docker container as follows:
 
  ```bash
- docker run --runtime nvidia --gpus all --ipc=host -p 8000:8000 \
- -v hf_cache:/root/.cache/huggingface \
- vllm/vllm-openai:latest \
- --model ibnzterrell/Meta-Llama-3.3-70B-Instruct-AWQ-INT4 \
- --tensor-parallel-size 4 \
- --max-model-len 4096
+ import re
+ import os
+ import asyncio
+ import json
+ import time
+ import logging
+ import torch
+
+ from vllm import LLM, SamplingParams
+ from transformers import AutoTokenizer
+ from awq import AutoAWQForCausalLM
+
+ # Silence vLLM's verbose logging while keeping INFO-level logs for this script.
+ logging.getLogger('vllm').setLevel(logging.ERROR)
+ logging.basicConfig(level=logging.INFO)
+
+ class vLLMInterfaceAWQ:
+     def initializeawq(self):
+         # Load the AWQ INT4 checkpoint once and reuse it for all requests.
+         model_id = "vicky4s4s/llama-3.3-70b-instruct-AWQ-INT4"
+         self.sampling_params = SamplingParams(temperature=0.1, max_tokens=8000, seed=3)
+         self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+         self.llm = LLM(model=model_id, trust_remote_code=True, quantization='awq', max_model_len=12000)
+
+     def inferawq(self, inputs):
+         # Apply the model's chat template, generate, and report simple throughput stats.
+         start_time = time.time()
+         chat_format = [{"role": "user", "content": inputs["prompt"]}]
+         text = self.tokenizer.apply_chat_template(chat_format, tokenize=False, add_generation_prompt=True)
+         result = self.llm.generate(text, self.sampling_params)
+         end_time = time.time()
+         total_time = end_time - start_time
+         generated_text = result[0].outputs[0].text
+         num_tokens = len(self.tokenizer(generated_text)['input_ids'])
+         tokens_per_sec = num_tokens / total_time if total_time > 0 else float('inf')
+         return {
+             "time_to_start": f"{start_time:.2f} sec",
+             "tokens_per_sec": f"{tokens_per_sec:.2f} tokens/sec",
+             "total_run_time": f"{total_time:.2f} sec",
+             "generated_text": f"{generated_text}"
+         }
+
+ model = vLLMInterfaceAWQ()
+ model.initializeawq()
+
+ async def generate_text_async(prompt, llama_interface):
+     # Run the blocking vLLM call in the default thread pool so it does not block the event loop.
+     loop = asyncio.get_event_loop()
+     response = await loop.run_in_executor(None, llama_interface.inferawq, {"prompt": prompt})
+     return response
+
+ async def process_queries_async(prompts):
+     tasks = [generate_text_async(prompt, model) for prompt in prompts]
+     results = await asyncio.gather(*tasks)
+     return results  # one result dict per prompt, in the same order as `prompts`
+
+ answer = asyncio.run(process_queries_async(["what is java?"]))
+ print(answer)
  ```
 
  To send a request to the deployed vLLM endpoint, which is compatible with the [OpenAI OpenAPI specification](https://github.com/openai/openai-openapi), i.e. `/v1/chat/completions`:
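
For a self-contained reference, here is a minimal sketch of such a request using the `openai` Python client. It assumes the server is reachable at `http://localhost:8000/v1` and that the model is served under the name used in the Docker command above; adjust both to match your deployment.

```python
from openai import OpenAI

# Point the OpenAI client at the locally deployed vLLM server (assumed address and port).
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

chat_completion = client.chat.completions.create(
    # Model name as served; change it if your endpoint exposes a different identifier.
    model="ibnzterrell/Meta-Llama-3.3-70B-Instruct-AWQ-INT4",
    messages=[{"role": "user", "content": "What is Java?"}],
    max_tokens=256,
    temperature=0.1,
)
print(chat_completion.choices[0].message.content)
```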