Running with vLLM


Hello,

I am trying to run this model using vLLM, but I get the following ValueError:

ValueError: The decoder prompt (length 58700) is longer than the maximum model length of 32768. Make sure that `max_model_len` is no smaller than the number of text tokens plus multimodal tokens. For image inputs, the number of image tokens depends on the number of images, and possibly their aspect ratios as well.

Is it because I am trying to use a large image?
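
The error message explicitly mentions the max_model_len engine argument, so for reference, this is a minimal sketch of how I understand it could be raised when constructing the engine (the 65536 value is just an example; as far as I know it has to stay within the context length the model config actually supports, and a longer context needs more GPU memory):

llm = LLM(
    model="Qwen/Qwen2.5-Omni-7B",
    tensor_parallel_size=4,
    # Example value only; it must not exceed the context length the model
    # config supports, and a longer context needs more KV-cache memory.
    max_model_len=65536,
)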

Here is the full code:

from fastapi import FastAPI, File, UploadFile, Form
from fastapi.responses import JSONResponse
import base64
import httpx
from vllm import LLM, SamplingParams

app = FastAPI()

# Load the multimodal model once on startup
llm = LLM(
    model="Qwen/Qwen2.5-Omni-7B",
    tensor_parallel_size=4  # Spread model across 4 GPUs
)



@app.post("/generate/")
async def generate_response(
    text: str = Form(...),
    image: UploadFile = File(None),          # Optional file upload
    image_url: str = Form(None)               # Optional image URL
):
    image_bytes = None

    if image is not None:
        # Use uploaded file if provided
        image_bytes = await image.read()
    elif image_url:
        # Download image from URL
        async with httpx.AsyncClient() as client:
            response = await client.get(image_url)
            if response.status_code != 200:
                return JSONResponse({"error": "Failed to download image from URL"}, status_code=400)
            image_bytes = response.content
    else:
        return JSONResponse({"error": "No image file or URL provided"}, status_code=400)

    # Encode image bytes to base64 string
    image_b64 = base64.b64encode(image_bytes).decode("utf-8")

    # Prepare prompt with text and base64 image
    prompt = (
        "<|im_start|>system\n"
        "You are a helpful assistant.\n"
        "<|im_end|>\n"
        "<|im_start|>user\n"
        f"{text}\n"
        f"<image:{image_b64}>\n"
        "<|im_end|>\n"
        "<|im_start|>assistant\n"
    )
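    # Note: the base64 string above becomes part of the prompt text, so it is
    # tokenized like ordinary text and can add tens of thousands of tokens
    # for a single image.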

    # Set generation parameters
    sampling_params = SamplingParams(
        temperature=0.7,
        top_p=0.9,
        max_tokens=512,
    )

    # Generate response
    outputs = llm.generate(prompt, sampling_params)

    # Extract generated text
    generated_text = outputs[0].outputs[0].text.strip()

    return JSONResponse({"response": generated_text})
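
Looking at the prompt construction again, I suspect the base64 string is being tokenized as ordinary text, which would explain a 58700-token decoder prompt from a single image. If I understand the vLLM docs correctly, the image should instead be passed through multi_modal_data, with only a placeholder in the prompt string. Here is a rough sketch of how the generate call inside the handler might change (the placeholder tokens below are my assumption from the model's chat template, and text / image_bytes are the variables already defined above):

import io
from PIL import Image

# Decode the uploaded bytes into a PIL image instead of a base64 string
pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")

# Only a placeholder goes into the prompt text; the exact placeholder for
# Qwen2.5-Omni should come from its chat template (assumed here, not verified)
prompt = (
    "<|im_start|>system\n"
    "You are a helpful assistant.\n"
    "<|im_end|>\n"
    "<|im_start|>user\n"
    f"{text}\n"
    "<|vision_bos|><|IMAGE|><|vision_eos|>\n"  # assumed image placeholder
    "<|im_end|>\n"
    "<|im_start|>assistant\n"
)

# Pass the image as multimodal data so it is not counted as text tokens
outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"image": pil_image}},
    sampling_params,
)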
