import gradio as gr
import torch
from PIL import Image
import os
from transformers import AutoProcessor, AutoModelForImageTextToText
def _detect_device():
    if torch.cuda.is_available():
        return "cuda"
    if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
        return "mps"
    return "cpu"

DEVICE = _detect_device()
DTYPE = torch.float16 if DEVICE in ("cuda", "mps") else torch.float32
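
# fp16 halves memory for the 2.2B model on CUDA/MPS; CPU stays in fp32,
# where half-precision kernels are typically slow or unsupported.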
SMOL_ID = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
smol_model = AutoModelForImageTextToText.from_pretrained(
    SMOL_ID,
    dtype=DTYPE,  # recent transformers accept `dtype`; older releases call this kwarg `torch_dtype`
).to(DEVICE)
smol_model.eval()
smol_proc = AutoProcessor.from_pretrained(SMOL_ID)
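# The processor bundles the tokenizer and image processor and carries the
# chat template used to serialize the multimodal messages below.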
DEFAULT_SYS = """ Answer the user's question using only what is visible in the provided image.
Reply strictly in the same language as the user's message.
Do not repeat or translate the question and do not add extra commentary — only give the answer.
"""
def _to_pil(x):
    if isinstance(x, Image.Image):
        return x.convert("RGB")
    if isinstance(x, str) and os.path.exists(x):
        try:
            return Image.open(x).convert("RGB")
        except Exception:
            return None
    return None

def _content_to_model(content):
    # Normalize one history turn's content (a string, or a list of strings/dicts;
    # Gradio may hand file content back as a tuple of paths) into the
    # list-of-parts format the chat template expects.
    parts = []
    if isinstance(content, (list, tuple)):
        for item in content:
            if isinstance(item, str):
                low = item.lower()
                if low.endswith((".png", ".jpg", ".jpeg", ".webp", ".bmp", ".gif")) and os.path.exists(item):
                    pil = _to_pil(item)
                    if pil is not None:
                        parts.append({"type": "image", "image": pil})
                else:
                    parts.append({"type": "text", "text": item})
            elif isinstance(item, dict):
                ty = item.get("type")
                if ty == "image":
                    pil = _to_pil(item.get("image"))
                    if pil is not None:
                        parts.append({"type": "image", "image": pil})
                elif ty == "text":
                    parts.append({"type": "text", "text": item.get("text", "")})
                elif "text" in item:
                    parts.append({"type": "text", "text": item.get("text", "")})
                elif "image" in item:
                    pil = _to_pil(item.get("image"))
                    if pil is not None:
                        parts.append({"type": "image", "image": pil})
    elif isinstance(content, str):
        parts.append({"type": "text", "text": content})
    if not parts:
        parts = [{"type": "text", "text": ""}]
    return parts
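
# Illustrative shapes only (the file path here is hypothetical):
#   _content_to_model("hi")               -> [{"type": "text", "text": "hi"}]
#   _content_to_model(["./images/x.png"]) -> [{"type": "image", "image": <PIL.Image>}] if the file exists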
# Message-list builders; not called by the ChatInterface flow below, kept as utilities.
def add_user_turn(messages, text, image):
    content = []
    if image is not None:
        content.append({"type": "image", "image": image})
    if text and text.strip():
        content.append({"type": "text", "text": text.strip()})
    if not content:
        content = [{"type": "text", "text": ""}]
    messages.append({"role": "user", "content": content})

def add_assistant_text(messages, text):
    messages.append({"role": "assistant", "content": [{"type": "text", "text": text}]})

@torch.inference_mode()
def gen_smol(messages, max_new_tokens=192):
    enc = smol_proc.apply_chat_template(
        messages, add_generation_prompt=True, tokenize=True,
        return_tensors="pt", return_dict=True
    ).to(DEVICE, dtype=DTYPE)  # cast floating inputs (pixel_values) to the model dtype; integer ids are unaffected
    out = smol_model.generate(
        **enc,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.7,
        top_p=0.95,
        repetition_penalty=1.1,
        no_repeat_ngram_size=3,
    )
    # Trim the prompt portion so the reply does not echo the system/user turns
    trimmed = out[0, enc["input_ids"].shape[1]:]
    text = smol_proc.decode(trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    return text.strip()
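
# Sampling (temperature/top_p) gives varied phrasings; for OCR-style questions
# where determinism matters more, a greedy call is a reasonable swap:
#   out = smol_model.generate(**enc, max_new_tokens=max_new_tokens, do_sample=False)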
EXAMPLES = {
    "Arabic sign — OCR + translate": {
        "url": "./images/Stop_sign_UAE.jpg",
        # "Read the text on the sign and translate it into English."
        "prompt": "اقرأ النص على اللافتة وترجمه إلى الإنجليزية."
    },
    "Diwali rangoli — explain (Hindi)": {
        "url": "./images/The_Rangoli_of_Lights.jpg",
        # "Explain the cultural significance of this rangoli and the diyas in one or two sentences."
        "prompt": "इस रंगोली और दीयों का सांस्कृतिक महत्व एक–दो वाक्यों में समझाइए।"
    },
    "Arabic dallah — identify & meaning": {
        "url": "./images/A_dallah_a_traditional_Arabic_coffee_pot_with_cups_and_coffee_beans.jpg",
        # "What is this vessel called, and what is its cultural symbolism in the Arabian Peninsula?"
        "prompt": "ما اسم هذا الإناء وما رمزيته الثقافية في الجزيرة العربية؟"
    },
    "Holi — which festival? (Hindi)": {
        "url": "./images/Indian_Festival_of_colors_Holi.jpg",
        # "Which festival is this? In two sentences, explain why people apply colors."
        "prompt": "यह कौन‑सा त्योहार है? दो वाक्यों में बताइए कि लोग रंग क्यों लगाते हैं।"
    },
}
EXAMPLE_ROWS = [[v["url"], v["prompt"]] for v in EXAMPLES.values()]
CHAT_EXAMPLES = [{"text": v["prompt"], "files": [v["url"]]} for v in EXAMPLES.values()]
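# CHAT_EXAMPLES matches the {"text": ..., "files": [...]} shape a multimodal
# gr.ChatInterface expects for examples; EXAMPLE_ROWS keeps the same data as
# plain [url, prompt] pairs (unused by the ChatInterface below).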
def smol_respond(message, history):
    # Build model messages from the prior history, starting with the system turn
    model_messages = [
        {"role": "system", "content": [{"type": "text", "text": DEFAULT_SYS}]}
    ]
    if isinstance(history, list):
        for msg in history:
            role = msg.get("role", "user")
            content = _content_to_model(msg.get("content", []))
            model_messages.append({"role": role, "content": content})
    # Current user message: a multimodal ChatInterface passes {"text": ..., "files": [...]}
    user_text = ""
    user_files = []
    if isinstance(message, dict):
        user_text = message.get("text", "") or ""
        user_files = message.get("files", []) or []
    elif isinstance(message, str):
        user_text = message
    user_content = []
    for f in user_files:
        pil = _to_pil(f)
        if pil is not None:
            user_content.append({"type": "image", "image": pil})
    if user_text:
        user_content.append({"type": "text", "text": user_text})
    if not user_content:
        user_content = [{"type": "text", "text": ""}]
    model_messages.append({"role": "user", "content": user_content})
    # Generate and return the assistant reply text
    return gen_smol(model_messages)
# Minimal UI as requested: a small heading via `title`, with examples at the bottom only
demo = gr.ChatInterface(
    fn=smol_respond,
    type="messages",
    autofocus=False,
    multimodal=True,
    title="Base Vision Chat",
    examples=CHAT_EXAMPLES,
    cache_examples=False,
)
if __name__ == "__main__":
    demo.launch()