import os

import gradio as gr
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForImageTextToText

def _detect_device():
    # Prefer CUDA, then Apple Silicon (MPS), then fall back to CPU.
    if torch.cuda.is_available():
        return "cuda"
    if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
        return "mps"
    return "cpu"


DEVICE = _detect_device()
# Half precision saves memory on GPU/MPS; CPU inference stays in float32.
DTYPE = torch.float16 if DEVICE in ("cuda", "mps") else torch.float32

SMOL_ID = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
smol_model = AutoModelForImageTextToText.from_pretrained(
    SMOL_ID, dtype=DTYPE  # newer transformers accept `dtype`; older releases spell it `torch_dtype`
).to(DEVICE)
smol_model.eval()
smol_proc = AutoProcessor.from_pretrained(SMOL_ID)

DEFAULT_SYS = """Answer the user's question using only what is visible in the provided image.
Reply strictly in the same language as the user's message.
Do not repeat or translate the question and do not add extra commentary — only give the answer.
"""

def _to_pil(x):
    """Coerce a PIL image or an existing file path to an RGB PIL image; return None otherwise."""
    if isinstance(x, Image.Image):
        return x.convert("RGB")
    if isinstance(x, str) and os.path.exists(x):
        try:
            return Image.open(x).convert("RGB")
        except Exception:
            return None
    return None

def _content_to_model(content):
    """Convert one Gradio history message's content into SmolVLM chat parts."""
    parts = []
    # Gradio may deliver file content as a tuple of paths, so accept tuples alongside lists.
    if isinstance(content, (list, tuple)):
        for item in content:
            if isinstance(item, str):
                low = item.lower()
                if low.endswith((".png", ".jpg", ".jpeg", ".webp", ".bmp", ".gif")) and os.path.exists(item):
                    pil = _to_pil(item)
                    if pil is not None:
                        parts.append({"type": "image", "image": pil})
                else:
                    parts.append({"type": "text", "text": item})
            elif isinstance(item, dict):
                ty = item.get("type")
                if ty == "image":
                    pil = _to_pil(item.get("image"))
                    if pil is not None:
                        parts.append({"type": "image", "image": pil})
                elif ty == "text":
                    parts.append({"type": "text", "text": item.get("text", "")})
                elif "text" in item:
                    parts.append({"type": "text", "text": item.get("text", "")})
                elif "image" in item:
                    pil = _to_pil(item.get("image"))
                    if pil is not None:
                        parts.append({"type": "image", "image": pil})
    elif isinstance(content, str):
        parts.append({"type": "text", "text": content})
    if not parts:
        # Keep the turn well-formed even when nothing usable was found.
        parts = [{"type": "text", "text": ""}]
    return parts

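# Illustration only (shapes assumed from Gradio's messages-format history, not from this app):
#   _content_to_model("Describe this photo")
#       -> [{"type": "text", "text": "Describe this photo"}]
#   _content_to_model(("./images/Stop_sign_UAE.jpg",))
#       -> [{"type": "image", "image": <PIL.Image>}]   # tuple form is how Gradio tends to pass files
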
def add_user_turn(messages, text, image):
    # Helper for building chat turns manually (not used by the ChatInterface flow below).
    content = []
    if image is not None:
        content.append({"type": "image", "image": image})
    if text and text.strip():
        content.append({"type": "text", "text": text.strip()})
    if not content:
        content = [{"type": "text", "text": ""}]
    messages.append({"role": "user", "content": content})


def add_assistant_text(messages, text):
    messages.append({"role": "assistant", "content": [{"type": "text", "text": text}]})

def gen_smol(messages, max_new_tokens=192):
    enc = smol_proc.apply_chat_template(
        messages, add_generation_prompt=True, tokenize=True,
        return_tensors="pt", return_dict=True
    ).to(DEVICE, dtype=DTYPE)  # BatchFeature.to casts only floating tensors (pixel_values); ids stay integer
    with torch.inference_mode():
        out = smol_model.generate(
            **enc,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.95,
            repetition_penalty=1.1,
            no_repeat_ngram_size=3,
        )
    # Trim the prompt portion so the reply doesn't echo the system/user text
    trimmed = out[0, enc["input_ids"].shape[1]:]
    text = smol_proc.decode(trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    return text.strip()

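# A minimal sketch of calling gen_smol directly, outside the Gradio loop.
# The image path reuses one of the bundled examples; any local RGB image works.
#
#   probe = [
#       {"role": "system", "content": [{"type": "text", "text": DEFAULT_SYS}]},
#       {"role": "user", "content": [
#           {"type": "image", "image": Image.open("./images/Stop_sign_UAE.jpg").convert("RGB")},
#           {"type": "text", "text": "What does the sign say?"},
#       ]},
#   ]
#   print(gen_smol(probe))
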
EXAMPLES = {
    "Arabic sign — OCR + translate": {
        "url": "./images/Stop_sign_UAE.jpg",
        # "Read the text on the sign and translate it into English."
        "prompt": "اقرأ النص على اللافتة وترجمه إلى الإنجليزية."
    },
    "Diwali rangoli — explain (Hindi)": {
        "url": "./images/The_Rangoli_of_Lights.jpg",
        # "Explain the cultural significance of this rangoli and the diyas in one or two sentences."
        "prompt": "इस रंगोली और दीयों का सांस्कृतिक महत्व एक–दो वाक्यों में समझाइए।"
    },
    "Arabic dallah — identify & meaning": {
        "url": "./images/A_dallah_a_traditional_Arabic_coffee_pot_with_cups_and_coffee_beans.jpg",
        # "What is this vessel called, and what is its cultural symbolism in the Arabian Peninsula?"
        "prompt": "ما اسم هذا الإناء وما رمزيته الثقافية في الجزيرة العربية؟"
    },
    "Holi — which festival? (Hindi)": {
        "url": "./images/Indian_Festival_of_colors_Holi.jpg",
        # "Which festival is this? Explain in two sentences why people apply colors."
        "prompt": "यह कौन‑सा त्योहार है? दो वाक्यों में बताइए कि लोग रंग क्यों लगाते हैं।"
    },
}

# CHAT_EXAMPLES is the dict shape ChatInterface expects for multimodal examples;
# EXAMPLE_ROWS is a row-style variant that is not referenced by the UI below.
EXAMPLE_ROWS = [[v["url"], v["prompt"]] for v in EXAMPLES.values()]
CHAT_EXAMPLES = [{"text": v["prompt"], "files": [v["url"]]} for v in EXAMPLES.values()]

def smol_respond(message, history):
    # Rebuild the full model conversation from prior Gradio history
    model_messages = [
        {"role": "system", "content": [{"type": "text", "text": DEFAULT_SYS}]}
    ]
    if isinstance(history, list):
        for msg in history:
            role = msg.get("role", "user")
            content = _content_to_model(msg.get("content", []))
            model_messages.append({"role": role, "content": content})
    # Current user message: the multimodal textbox sends a dict with "text" and "files"
    user_text = ""
    user_files = []
    if isinstance(message, dict):
        user_text = message.get("text", "") or ""
        user_files = message.get("files", []) or []
    elif isinstance(message, str):
        user_text = message
    user_content = []
    for f in user_files:
        pil = _to_pil(f)
        if pil is not None:
            user_content.append({"type": "image", "image": pil})
    if user_text:
        user_content.append({"type": "text", "text": user_text})
    if not user_content:
        user_content = [{"type": "text", "text": ""}]
    model_messages.append({"role": "user", "content": user_content})
    # Generate and return the assistant reply text
    return gen_smol(model_messages)

# Minimal UI: a small heading via the title, with examples rendered at the bottom only
demo = gr.ChatInterface(
    fn=smol_respond,
    type="messages",
    autofocus=False,
    multimodal=True,
    title="Base Vision Chat",
    examples=CHAT_EXAMPLES,
    cache_examples=False,
)

if __name__ == "__main__":
    demo.launch()
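
# Rough dependency sketch (unpinned; versions are an assumption, not taken from this Space):
#   pip install gradio transformers torch pillow
# SmolVLM2 needs a reasonably recent transformers release; if from_pretrained fails to
# resolve the architecture, upgrading transformers is the first thing to try.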