Running vLLM with docker-compose:
```yaml
version: '3.3'
services:
  pdf-parser:
    image: vllm/vllm-openai:v0.7.0
    container_name: pdf-parser
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['3']
              capabilities: [gpu]
    # environment:
    #   - CUDA_VISIBLE_DEVICES=0
    #   - VLLM_ATTENTION_BACKEND=FLASHINFER
    ports:
      - "${PARSER_PORT}:${PARSER_PORT}"
    volumes:
      - type: bind
        source: ${HF_CACHE_DIR}
        target: /root/.cache/huggingface
    command: --served-model-name ${PARSER_MODEL_NAME} --model ds4sd/SmolDocling-256M-preview --dtype bfloat16 --host 0.0.0.0 --port ${PARSER_PORT} --api-key ${PARSER_API_KEY} --max-model-len 8192 --gpu-memory-utilization 0.5
```
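The compose file reads its settings from environment variables. For reference, a minimal example `.env` (the values below are placeholders inferred from the client code, not necessarily the exact ones used):

```env
PARSER_PORT=8082
PARSER_MODEL_NAME=SmolDocling
PARSER_API_KEY=dummy
HF_CACHE_DIR=/path/to/huggingface/cache
```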
Python client:

```python
from openai import OpenAI
import base64
import io
from PIL import Image

# Initialize the client
client = OpenAI(
    api_key="dummy",
    base_url="http://210.211.99.4:8082/v1"
)

model_name = "SmolDocling"
PROMPT_TEXT = "Convert page to Docling."
prompt = f"<|im_start|>User:<image>{PROMPT_TEXT}<end_of_utterance>\nAssistant:"

# Encode an image to base64, downscaling so the longest side is at most max_size
def encode_image(image_path, max_size=512):
    img = Image.open(image_path)
    original_format = img.format  # remember the format before resizing (resize() returns an image with format=None)

    # Resize if larger than max_size, preserving the aspect ratio
    width, height = img.size
    if max(width, height) > max_size:
        scale = max_size / max(width, height)
        new_width = int(width * scale)
        new_height = int(height * scale)
        img = img.resize((new_width, new_height), Image.LANCZOS)
    print(f"Image size: {img.size}")

    # Convert to bytes
    buffer = io.BytesIO()
    img.save(buffer, format="JPEG" if original_format == "JPEG" else "PNG")
    buffer.seek(0)

    # Encode to base64
    return base64.b64encode(buffer.read()).decode("utf-8")

# Path to your image
image_path = "./pdf_images/page_1.png"

# Get the base64 string (not used in the request below, which points at a remote URL)
base64_image = encode_image(image_path)

# Create the request
response = client.chat.completions.create(
    model=model_name,
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": "https://stg-s3.distilled.ai/distilled/images/page_1.png"}},
                {"type": "text", "text": f"{PROMPT_TEXT}"}
            ]
        }
    ],
    max_tokens=300
)

# Print the response
print(response.choices[0].message.content)
```
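Note that `base64_image` is computed but never sent; the request above references a remote URL instead. To actually send the locally encoded page, the image can be passed as a base64 data URL, which vLLM's OpenAI-compatible server accepts (a minimal sketch, reusing the objects defined above):

```python
# Sketch: send the locally encoded image as a base64 data URL instead of a remote URL
response = client.chat.completions.create(
    model=model_name,
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_image}"}},
                {"type": "text", "text": PROMPT_TEXT},
            ],
        }
    ],
    max_tokens=300,
)
print(response.choices[0].message.content)
```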
Logs
```
BadRequestError Traceback (most recent call last)
Cell In[4], line 50
47 base64_image = encode_image(image_path)
49 # Create the payload
---> 50 response = client.chat.completions.create(
51 model=model_name,
52 messages=[
53 {
54 "role": "user",
55 "content": [
56 {"type": "image_url", "image_url": {"url": "https://stg-s3.distilled.ai/distilled/images/page_1.png"}},
57 {"type": "text", "text": f"{PROMPT_TEXT}"}
58 ]
59 }
60 ],
61 max_tokens=300
62 )
64 # Print the response
65 print(response.choices[0].message.content)
File ~/.venv/lib/python3.10/site-packages/openai/_utils/_utils.py:279, in required_args..inner..wrapper(*args, **kwargs)
277 msg = f"Missing required argument: {quote(missing[0])}"
278 raise TypeError(msg)
--> 279 return func(*args, **kwargs)
File ~/.venv/lib/python3.10/site-packages/openai/resources/chat/completions/completions.py:879, in Completions.create(self, messages, model, audio, frequency_penalty, function_call, functions, logit_bias, logprobs, max_completion_tokens, max_tokens, metadata, modalities, n, parallel_tool_calls, prediction, presence_penalty, reasoning_effort, response_format, seed, service_tier, stop, store, stream, stream_options, temperature, tool_choice, tools, top_logprobs, top_p, user, extra_headers, extra_query, extra_body, timeout)
837 @required_args(["messages", "model"], ["messages", "model", "stream"])
838 def create(
839 self,
(...)
876 timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
877 ) -> ChatCompletion | Stream[ChatCompletionChunk]:
878 validate_response_format(response_format)
--> 879 return self._post(
880 "/chat/completions",
881 body=maybe_transform(
882 {
883 "messages": messages,
884 "model": model,
885 "audio": audio,
886 "frequency_penalty": frequency_penalty,
887 "function_call": function_call,
888 "functions": functions,
889 "logit_bias": logit_bias,
890 "logprobs": logprobs,
891 "max_completion_tokens": max_completion_tokens,
892 "max_tokens": max_tokens,
893 "metadata": metadata,
894 "modalities": modalities,
895 "n": n,
896 "parallel_tool_calls": parallel_tool_calls,
897 "prediction": prediction,
898 "presence_penalty": presence_penalty,
899 "reasoning_effort": reasoning_effort,
900 "response_format": response_format,
901 "seed": seed,
902 "service_tier": service_tier,
903 "stop": stop,
904 "store": store,
905 "stream": stream,
906 "stream_options": stream_options,
907 "temperature": temperature,
908 "tool_choice": tool_choice,
909 "tools": tools,
910 "top_logprobs": top_logprobs,
911 "top_p": top_p,
912 "user": user,
913 },
914 completion_create_params.CompletionCreateParams,
915 ),
916 options=make_request_options(
917 extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
918 ),
919 cast_to=ChatCompletion,
920 stream=stream or False,
921 stream_cls=Stream[ChatCompletionChunk],
922 )
File ~/.venv/lib/python3.10/site-packages/openai/_base_client.py:1296, in SyncAPIClient.post(self, path, cast_to, body, options, files, stream, stream_cls)
1282 def post(
1283 self,
1284 path: str,
(...)
1291 stream_cls: type[_StreamT] | None = None,
1292 ) -> ResponseT | _StreamT:
1293 opts = FinalRequestOptions.construct(
1294 method="post", url=path, json_data=body, files=to_httpx_files(files), **options
1295 )
-> 1296 return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls))
File ~/.venv/lib/python3.10/site-packages/openai/_base_client.py:973, in SyncAPIClient.request(self, cast_to, options, remaining_retries, stream, stream_cls)
970 else:
971 retries_taken = 0
--> 973 return self._request(
974 cast_to=cast_to,
975 options=options,
976 stream=stream,
977 stream_cls=stream_cls,
978 retries_taken=retries_taken,
979 )
File ~/.venv/lib/python3.10/site-packages/openai/_base_client.py:1077, in SyncAPIClient._request(self, cast_to, options, retries_taken, stream, stream_cls)
1074 err.response.read()
1076 log.debug("Re-raising status error")
-> 1077 raise self._make_status_error_from_response(err.response) from None
1079 return self._process_response(
1080 cast_to=cast_to,
1081 options=options,
(...)
1085 retries_taken=retries_taken,
1086 )
BadRequestError: Error code: 400 - {'object': 'error', 'message': 'resolution_max_side cannot be larger than max_image_size', 'type': 'BadRequestError', 'param': None, 'code': 400}
```
The problem is that this vLLM version is too old; I installed the latest vLLM under Python 3.11, which resolved it.
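For completeness, a rough sketch of that workaround (the exact commands and versions are my assumption; the serve flags simply mirror the compose command above):

```bash
# Assumes a Python 3.11 environment is available; install the latest vLLM and serve the model directly
python3.11 -m venv .venv
source .venv/bin/activate
pip install -U vllm
vllm serve ds4sd/SmolDocling-256M-preview \
    --served-model-name SmolDocling \
    --dtype bfloat16 --host 0.0.0.0 --port 8082 \
    --api-key dummy --max-model-len 8192 --gpu-memory-utilization 0.5
```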