Update README.md
README.md CHANGED
@@ -218,12 +218,65 @@ chat_completion = client.chat.completions.create(
To run vLLM with Llama 3.3 70B Instruct AWQ in INT4, you will need to have Docker installed (see [installation notes](https://docs.docker.com/engine/install/)) and run the latest vLLM Docker container (a sketch of that command follows the Python example below), or load the model directly through the vLLM Python API as follows:

```python
+import asyncio
+import time
+import logging
+
+from vllm import LLM, SamplingParams
+from transformers import AutoTokenizer
+
+# Keep INFO-level logging for this script, but silence vLLM's verbose engine logs.
+logging.getLogger('vllm').setLevel(logging.ERROR)
+logging.basicConfig(level=logging.INFO)
+
+class vLLMInterfaceAWQ:
+    def initializeawq(self):
+        # Load the AWQ INT4 checkpoint and its tokenizer once, then reuse them for every request.
+        model_id = "vicky4s4s/llama-3.3-70b-instruct-AWQ-INT4"
+        self.sampling_params = SamplingParams(temperature=0.1, max_tokens=8000, seed=3)
+        self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+        self.llm = LLM(model=model_id, trust_remote_code=True, quantization='awq', max_model_len=12000)
+
+    def inferawq(self, inputs):
+        # Apply the model's chat template, generate, and return the text with simple throughput stats.
+        start_time = time.time()
+        chat_format = [{"role": "user", "content": inputs["prompt"]}]
+        text = self.tokenizer.apply_chat_template(chat_format, tokenize=False, add_generation_prompt=True)
+        result = self.llm.generate(text, self.sampling_params)
+        end_time = time.time()
+        total_time = end_time - start_time
+        generated_text = result[0].outputs[0].text
+        num_tokens = len(self.tokenizer(generated_text)['input_ids'])
+        tokens_per_sec = num_tokens / total_time if total_time > 0 else float('inf')
+        return {
+            "time_to_start": f"{start_time:.2f}",  # epoch timestamp at which generation started
+            "tokens_per_sec": f"{tokens_per_sec:.2f} tokens/sec",
+            "total_run_time": f"{total_time:.2f} sec",
+            "generated_text": generated_text,
+        }
+
+model = vLLMInterfaceAWQ()
+model.initializeawq()
+
+async def generate_text_async(prompt, llama_interface):
+    # llm.generate() blocks, so run it in a thread executor to keep the event loop responsive.
+    loop = asyncio.get_running_loop()
+    return await loop.run_in_executor(None, llama_interface.inferawq, {"prompt": prompt})
+
+async def process_queries_async(prompts):
+    tasks = [generate_text_async(prompt, model) for prompt in prompts]
+    results = await asyncio.gather(*tasks)
+    # Pair each prompt with its result instead of returning only the first one.
+    return dict(zip(prompts, results))
+
+answers = asyncio.run(process_queries_async(["what is java?"]))
+print(answers)
```
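For the Docker route mentioned at the top of this section, the command itself is not visible in this hunk. A typical invocation of the official `vllm/vllm-openai` image for this checkpoint might look like the sketch below; the image tag, port, mounted cache path, and flags are assumptions based on standard vLLM Docker usage rather than part of this commit:

```bash
# Sketch: serve the AWQ INT4 checkpoint with vLLM's OpenAI-compatible server on port 8000.
docker run --runtime nvidia --gpus all \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    -p 8000:8000 --ipc=host \
    vllm/vllm-openai:latest \
    --model vicky4s4s/llama-3.3-70b-instruct-AWQ-INT4 \
    --quantization awq \
    --max-model-len 12000
```

`--max-model-len 12000` mirrors the context length used in the Python example above.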

To send a request to the deployed vLLM endpoint, which is compatible with the [OpenAI OpenAPI specification](https://github.com/openai/openai-openapi) (i.e. `/v1/chat/completions`):
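For instance, with `curl` (a minimal sketch: the payload is illustrative, and the `model` value must match the model the server was started with):

```bash
curl http://localhost:8000/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "vicky4s4s/llama-3.3-70b-instruct-AWQ-INT4",
        "messages": [{"role": "user", "content": "what is java?"}],
        "temperature": 0.1,
        "max_tokens": 512
    }'
```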