import gradio as gr
import torch
from PIL import Image
import os
from transformers import AutoProcessor, AutoModelForImageTextToText
def _detect_device():
    if torch.cuda.is_available():
        return "cuda"
    if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
        return "mps"
    return "cpu"

DEVICE = _detect_device()
DTYPE = torch.float16 if DEVICE in ("cuda", "mps") else torch.float32
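
# fp16 halves memory for the 2.2B model on CUDA/MPS; CPU stays in fp32,
# where half-precision kernels are typically slow or unsupported.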
SMOL_ID = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
smol_model = AutoModelForImageTextToText.from_pretrained(
    SMOL_ID,
    dtype=DTYPE,  # recent transformers accept `dtype`; older releases call this kwarg `torch_dtype`
).to(DEVICE)
smol_model.eval()
smol_proc = AutoProcessor.from_pretrained(SMOL_ID)
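# The processor bundles the tokenizer and image processor and carries the
# chat template used to serialize the multimodal messages below.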
DEFAULT_SYS = """ Answer the user's question using only what is visible in the provided image.
Reply strictly in the same language as the user's message.
Do not repeat or translate the question and do not add extra commentary — only give the answer.
"""
def _to_pil(x):
    if isinstance(x, Image.Image):
        return x.convert("RGB")
    if isinstance(x, str) and os.path.exists(x):
        try:
            return Image.open(x).convert("RGB")
        except Exception:
            return None
    return None

def _content_to_model(content):
    # Normalize one history turn's content (a string, or a list of strings/dicts;
    # Gradio may hand file content back as a tuple of paths) into the
    # list-of-parts format the chat template expects.
    parts = []
    if isinstance(content, (list, tuple)):
        for item in content:
            if isinstance(item, str):
                low = item.lower()
                if low.endswith((".png", ".jpg", ".jpeg", ".webp", ".bmp", ".gif")) and os.path.exists(item):
                    pil = _to_pil(item)
                    if pil is not None:
                        parts.append({"type": "image", "image": pil})
                else:
                    parts.append({"type": "text", "text": item})
            elif isinstance(item, dict):
                ty = item.get("type")
                if ty == "image":
                    pil = _to_pil(item.get("image"))
                    if pil is not None:
                        parts.append({"type": "image", "image": pil})
                elif ty == "text":
                    parts.append({"type": "text", "text": item.get("text", "")})
                elif "text" in item:
                    parts.append({"type": "text", "text": item.get("text", "")})
                elif "image" in item:
                    pil = _to_pil(item.get("image"))
                    if pil is not None:
                        parts.append({"type": "image", "image": pil})
    elif isinstance(content, str):
        parts.append({"type": "text", "text": content})
    if not parts:
        parts = [{"type": "text", "text": ""}]
    return parts
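
# Illustrative shapes only (the file path here is hypothetical):
#   _content_to_model("hi")               -> [{"type": "text", "text": "hi"}]
#   _content_to_model(["./images/x.png"]) -> [{"type": "image", "image": <PIL.Image>}] if the file exists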
# Message-list builders; not called by the ChatInterface flow below, kept as utilities.
def add_user_turn(messages, text, image):
    content = []
    if image is not None:
        content.append({"type": "image", "image": image})
    if text and text.strip():
        content.append({"type": "text", "text": text.strip()})
    if not content:
        content = [{"type": "text", "text": ""}]
    messages.append({"role": "user", "content": content})

def add_assistant_text(messages, text):
    messages.append({"role": "assistant", "content": [{"type": "text", "text": text}]})

@torch.inference_mode()
def gen_smol(messages, max_new_tokens=192):
    enc = smol_proc.apply_chat_template(
        messages, add_generation_prompt=True, tokenize=True,
        return_tensors="pt", return_dict=True
    ).to(DEVICE, dtype=DTYPE)  # cast floating inputs (pixel_values) to the model dtype; integer ids are unaffected
    out = smol_model.generate(
        **enc,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.7,
        top_p=0.95,
        repetition_penalty=1.1,
        no_repeat_ngram_size=3,
    )
    # Trim the prompt portion so the reply does not echo the system/user turns
    trimmed = out[0, enc["input_ids"].shape[1]:]
    text = smol_proc.decode(trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    return text.strip()
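
# Sampling (temperature/top_p) gives varied phrasings; for OCR-style questions
# where determinism matters more, a greedy call is a reasonable swap:
#   out = smol_model.generate(**enc, max_new_tokens=max_new_tokens, do_sample=False)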
EXAMPLES = {
    "Arabic sign — OCR + translate": {
        "url": "./images/Stop_sign_UAE.jpg",
        # "Read the text on the sign and translate it into English."
        "prompt": "اقرأ النص على اللافتة وترجمه إلى الإنجليزية."
    },
    "Diwali rangoli — explain (Hindi)": {
        "url": "./images/The_Rangoli_of_Lights.jpg",
        # "Explain the cultural significance of this rangoli and the diyas in one or two sentences."
        "prompt": "इस रंगोली और दीयों का सांस्कृतिक महत्व एक–दो वाक्यों में समझाइए।"
    },
    "Arabic dallah — identify & meaning": {
        "url": "./images/A_dallah_a_traditional_Arabic_coffee_pot_with_cups_and_coffee_beans.jpg",
        # "What is this vessel called, and what is its cultural symbolism in the Arabian Peninsula?"
        "prompt": "ما اسم هذا الإناء وما رمزيته الثقافية في الجزيرة العربية؟"
    },
    "Holi — which festival? (Hindi)": {
        "url": "./images/Indian_Festival_of_colors_Holi.jpg",
        # "Which festival is this? In two sentences, explain why people apply colors."
        "prompt": "यह कौन‑सा त्योहार है? दो वाक्यों में बताइए कि लोग रंग क्यों लगाते हैं।"
    },
}
EXAMPLE_ROWS = [[v["url"], v["prompt"]] for v in EXAMPLES.values()]
CHAT_EXAMPLES = [{"text": v["prompt"], "files": [v["url"]]} for v in EXAMPLES.values()]
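# CHAT_EXAMPLES matches the {"text": ..., "files": [...]} shape a multimodal
# gr.ChatInterface expects for examples; EXAMPLE_ROWS keeps the same data as
# plain [url, prompt] pairs (unused by the ChatInterface below).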
def smol_respond(message, history):
    # Build model messages from the prior history, starting with the system turn
    model_messages = [
        {"role": "system", "content": [{"type": "text", "text": DEFAULT_SYS}]}
    ]
    if isinstance(history, list):
        for msg in history:
            role = msg.get("role", "user")
            content = _content_to_model(msg.get("content", []))
            model_messages.append({"role": role, "content": content})
    # Current user message: a multimodal ChatInterface passes {"text": ..., "files": [...]}
    user_text = ""
    user_files = []
    if isinstance(message, dict):
        user_text = message.get("text", "") or ""
        user_files = message.get("files", []) or []
    elif isinstance(message, str):
        user_text = message
    user_content = []
    for f in user_files:
        pil = _to_pil(f)
        if pil is not None:
            user_content.append({"type": "image", "image": pil})
    if user_text:
        user_content.append({"type": "text", "text": user_text})
    if not user_content:
        user_content = [{"type": "text", "text": ""}]
    model_messages.append({"role": "user", "content": user_content})
    # Generate and return the assistant reply text
    return gen_smol(model_messages)
# Minimal UI as requested: a small heading via `title`, with examples at the bottom only
demo = gr.ChatInterface(
    fn=smol_respond,
    type="messages",
    autofocus=False,
    multimodal=True,
    title="Base Vision Chat",
    examples=CHAT_EXAMPLES,
    cache_examples=False,
)
if __name__ == "__main__":
    demo.launch()