Running with vLLM
#54 opened by Syafiqmuda
Hello,
I am trying to run this model using vLLM, but it seems I get a ValueError:
ValueError: The decoder prompt (length 58700) is longer than the maximum model length of 32768. Make sure that `max_model_len` is no smaller than the number of text tokens plus multimodal tokens. For image inputs, the number of image tokens depends on the number of images, and possibly their aspect ratios as well.
Is it because I am trying to use a large image?
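From the error message it sounds like the limit could be raised when the engine is created. This is just my guess at the relevant knob (max_model_len on the LLM constructor), and I am not sure the model's 32768-token limit can actually be extended this way; untested on my side:

llm = LLM(
    model="Qwen/Qwen2.5-Omni-7B",
    tensor_parallel_size=4,
    # Guess: would need to exceed the 58700-token prompt, but may not be
    # accepted above the model's native 32768-token context.
    max_model_len=65536,
)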
Here is the code I am running:
from fastapi import FastAPI, File, UploadFile, Form
from fastapi.responses import JSONResponse
import base64
import httpx
from vllm import LLM, SamplingParams
app = FastAPI()
# Load the multimodal model once on startup
llm = LLM(
    model="Qwen/Qwen2.5-Omni-7B",
    tensor_parallel_size=4,  # Spread model across 4 GPUs
)

@app.post("/generate/")
async def generate_response(
    text: str = Form(...),
    image: UploadFile = File(None),  # Optional file upload
    image_url: str = Form(None),     # Optional image URL
):
    image_bytes = None
    if image is not None:
        # Use uploaded file if provided
        image_bytes = await image.read()
    elif image_url:
        # Download image from URL
        async with httpx.AsyncClient() as client:
            response = await client.get(image_url)
            if response.status_code != 200:
                return JSONResponse({"error": "Failed to download image from URL"}, status_code=400)
            image_bytes = response.content
    else:
        return JSONResponse({"error": "No image file or URL provided"}, status_code=400)

    # Encode image bytes to a base64 string
    image_b64 = base64.b64encode(image_bytes).decode("utf-8")

    # Prepare prompt with text and base64 image
    prompt = (
        "<|im_start|>system\n"
        "You are a helpful assistant.\n"
        "<|im_end|>\n"
        "<|im_start|>user\n"
        f"{text}\n"
        f"<image:{image_b64}>\n"
        "<|im_end|>\n"
        "<|im_start|>assistant\n"
    )

    # Set generation parameters
    sampling_params = SamplingParams(
        temperature=0.7,
        top_p=0.9,
        max_tokens=512,
    )

    # Generate response
    outputs = llm.generate(prompt, sampling_params)

    # Extract generated text
    generated_text = outputs[0].outputs[0].text.strip()
    return JSONResponse({"response": generated_text})
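I also wonder whether the real issue is that the base64 string pasted into the prompt is being tokenized as plain text, which would explain the ~58k-token prompt regardless of max_model_len. Would passing the image as multimodal input be the intended way instead? A rough sketch of what I have in mind, assuming a vLLM version whose LLM.chat accepts OpenAI-style image_url content parts for this model (untested, and the data URL / MIME type here is just an assumption):

    # Sketch only: pass the image as multimodal input instead of embedding base64 in the prompt text.
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": [
                {"type": "text", "text": text},
                # Data URL built from the uploaded/downloaded bytes (JPEG assumed here)
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"}},
            ],
        },
    ]

    # As I understand it, LLM.chat applies the model's chat template and inserts the image
    # placeholder tokens itself, so the image is not counted as raw text tokens.
    outputs = llm.chat(messages, sampling_params)
    generated_text = outputs[0].outputs[0].text.strip()

Is that the recommended approach for this model, or is there another way to keep the prompt under the model length?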