# Minimal vLLM inference script: load a fine-tuned Qwen3 model, format a chat
# prompt with the model's own chat template, generate deterministically, and
# print the parsed result.
import json
import torch
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
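
# Build a single-turn chat request; any user prompt can be swapped in here.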
user_content = "Please write a short story about a cat who becomes a detective. The story should be humorous and no more than 500 words. Set it in a bustling city."
messages = [
    {"role": "user", "content": user_content},
]
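
# The tokenizer carries the Qwen3 chat template used to format the prompt below.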
model_name = "jungseob/qwen3-4b-instruct-clarity"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Shard the model across every visible GPU. torch.cuda.device_count() returns 0
# on a CUDA-less machine, so clamp to 1 (tensor_parallel_size must be at least 1,
# and should evenly divide the model's attention-head count).
num_gpus = torch.cuda.device_count()
llm = LLM(
    model=model_name,
    tensor_parallel_size=max(num_gpus, 1),
    trust_remote_code=True,
)

# Render the messages into the model's prompt format (for Qwen3 this wraps each
# turn in <|im_start|>/<|im_end|> tokens) and append the assistant header so
# generation starts a fresh reply.
prompt_string = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

# Greedy decoding: temperature=0 makes the output deterministic, so top_p has no
# effect. 400 tokens is roughly a 300-word ceiling; raise max_tokens if the full
# 500-word story gets truncated.
sampling_params = SamplingParams(
    max_tokens=400,
    temperature=0,
    top_p=1.0,
)

# llm.generate takes a batch of prompt strings and returns one RequestOutput per
# prompt; each holds a list of completions, so index both levels for the text.
outputs = llm.generate([prompt_string], sampling_params)
generated_text = outputs[0].outputs[0].text

# The "clarity" fine-tune is assumed to return a JSON object; fall back to
# printing the raw text if the model emits plain prose instead.
try:
    print(json.loads(generated_text))
except json.JSONDecodeError:
    print(generated_text)