import gradio as gr
import torch

try:
    from unsloth import FastVisionModel
    UNSLOTH_AVAILABLE = True
except (ImportError, NotImplementedError):
    print("Unsloth not available or not supported, falling back to standard transformers")
    from transformers import AutoModelForVision2Seq, AutoProcessor
    UNSLOTH_AVAILABLE = False

from PIL import Image
import os
import warnings

# Disable torch compilation to avoid inductor backend issues
if hasattr(torch, '_dynamo'):
    torch._dynamo.config.disable = True
if hasattr(torch.backends, 'cudnn'):
    torch.backends.cudnn.allow_tf32 = True
if hasattr(torch.backends, 'cuda'):
    torch.backends.cuda.matmul.allow_tf32 = True

# Suppress warnings
warnings.filterwarnings('ignore')

# Global variables for model and processor
model = None
processor = None


def load_model():
    """Load the trained plant model"""
    global model, processor
    try:
        print("Loading base model...")
        if UNSLOTH_AVAILABLE:
            print("Using Unsloth FastVisionModel...")
            # Disable torch compilation for inference
            if hasattr(torch, '_dynamo'):
                torch._dynamo.reset()

            # Load the base model with unsloth
            model, processor = FastVisionModel.from_pretrained(
                "unsloth/gemma-3n-e2b-it-unsloth-bnb-4bit",
                load_in_4bit=True,
                use_gradient_checkpointing="unsloth",
            )

            print("Setting up LoRA configuration...")
            # Set up LoRA configuration to match training
            model = FastVisionModel.get_peft_model(
                model,
                finetune_vision_layers=True,
                finetune_language_layers=True,
                finetune_attention_modules=True,
                finetune_mlp_modules=True,
                r=16,
                lora_alpha=16,
                lora_dropout=0,
                bias="none",
                random_state=3407,
                use_rslora=False,
                loftq_config=None,
                target_modules="all-linear",
                modules_to_save=[
                    "lm_head",
                    "embed_tokens",
                ],
            )

            print("Loading trained adapter...")
            # Load the trained adapter weights
            adapter_path = "./gemma-3n_pois_med_plants"
            if os.path.exists(adapter_path):
                model.load_adapter(adapter_path, adapter_name="default")
                print("Adapter loaded successfully!")
            else:
                print(f"Warning: Model path {adapter_path} not found. Using base model.")

            # Enable inference mode with compilation disabled
            FastVisionModel.for_inference(model)
            # Ensure model is in eval mode
            model.eval()
            print("Model loaded successfully!")
            return True
        else:
            print("Unsloth not available, using base model only")
            # Fallback to base model without adapter
            from transformers import AutoModelForVision2Seq, AutoProcessor
            model = AutoModelForVision2Seq.from_pretrained(
                "google/gemma-3n-e2b-it",
                torch_dtype=torch.float16,
                device_map="auto"
            )
            processor = AutoProcessor.from_pretrained("google/gemma-3n-e2b-it")
            model.eval()
            print("Base model loaded successfully!")
            return True
    except Exception as e:
        print(f"Error loading model: {str(e)}")
        return False


# Load model on startup
print("Initializing model...")
model_loaded = load_model()


def process_image_with_preset(image, identification_type="Plant"):
    """Process image with preset prompts"""
    global model, processor

    if model is None or processor is None:
        return "⚠️ Model not loaded. Please restart the application."

    if image is None:
        return "📸 Please upload an image first."

    try:
        # Convert gradio image to PIL if needed
        if not isinstance(image, Image.Image):
            image = Image.fromarray(image)

        # Set instruction based on identification type with simple format
        if "Plant" in identification_type:
            instruction = "Identify this plant. Tell me its name, key features, and if it's safe or toxic."
        else:
            instruction = "Identify this insect. Tell me its name, key features, and if it's beneficial or harmful."
        # Prepare messages for the model
        messages = [
            {
                "role": "user",
                "content": [{"type": "image"}, {"type": "text", "text": instruction}]
            }
        ]

        # Apply chat template
        input_text = processor.apply_chat_template(messages, add_generation_prompt=True)

        # Process inputs
        device = "cuda" if torch.cuda.is_available() else "cpu"
        inputs = processor(
            image,
            input_text,
            add_special_tokens=False,
            return_tensors="pt",
        ).to(device)

        # Generate response with simple parameters
        with torch.no_grad():
            torch._dynamo.config.disable = True
            outputs = model.generate(
                **inputs,
                max_new_tokens=300,  # Increased from 150 to allow complete responses
                use_cache=True,
                temperature=0.5,
                top_p=0.8,
                top_k=30,
                do_sample=True,
                repetition_penalty=1.1,
                pad_token_id=processor.tokenizer.eos_token_id,
                eos_token_id=processor.tokenizer.eos_token_id
            )

        # Decode response
        generated_text = processor.decode(outputs[0], skip_special_tokens=True)
        print(f"Generated text: {generated_text}")

        # Extract assistant's response
        if "<|start_header_id|>assistant<|end_header_id|>" in generated_text:
            response = generated_text.split("<|start_header_id|>assistant<|end_header_id|>")[-1].strip()
        else:
            if input_text in generated_text:
                response = generated_text.replace(input_text, "").strip()
            else:
                response = generated_text

        # Clean up response - simple approach
        response = response.replace("<|eot_id|>", "").strip()
        if response.startswith("assistant"):
            response = response[9:].strip()

        # Basic cleaning - remove markdown and extra formatting
        response = response.replace('**', '').replace('*', '')
        response = response.replace('##', '').replace('#', '')

        # Remove empty lines and join, but preserve paragraph structure
        lines = [line.strip() for line in response.split('\n') if line.strip()]
        final_response = '\n'.join(lines)

        # Check if response seems incomplete (ends abruptly)
        if final_response and not final_response.endswith(('.', '!', '?', ':')):
            # If it doesn't end with proper punctuation, it might be cut off
            if len(final_response) > 50:  # Only if it's a substantial response
                final_response += "..."  # Add ellipsis to indicate truncation

        print(f"Final response: {final_response}")
        return final_response if final_response else "Unable to identify the specimen in the image. Please try with a clearer image."

    except Exception as e:
        return f"Error processing image: {str(e)}"


def process_with_custom_image_with_prompt(image, custom_prompt):
    """Process an image with a custom user prompt (not currently wired to any UI event)"""
    global model, processor

    if model is None or processor is None:
        return "⚠️ Model not loaded. Please restart the application."

    if image is None:
        return "📸 Please upload an image first."

    if not custom_prompt.strip():
        return "📝 Please enter a prompt."
    try:
        # Convert gradio image to PIL if needed
        if not isinstance(image, Image.Image):
            image = Image.fromarray(image)

        # Prepare messages for the model
        messages = [
            {
                "role": "user",
                "content": [{"type": "image"}, {"type": "text", "text": custom_prompt.strip()}]
            }
        ]

        # Apply chat template
        input_text = processor.apply_chat_template(messages, add_generation_prompt=True)

        # Process inputs
        device = "cuda" if torch.cuda.is_available() else "cpu"
        inputs = processor(
            image,
            input_text,
            add_special_tokens=False,
            return_tensors="pt",
        ).to(device)

        # Generate response
        with torch.no_grad():
            torch._dynamo.config.disable = True
            outputs = model.generate(
                **inputs,
                max_new_tokens=512,
                use_cache=True,
                temperature=0.7,
                top_p=0.95,
                top_k=50,
                do_sample=True,
                pad_token_id=processor.tokenizer.eos_token_id,
                eos_token_id=processor.tokenizer.eos_token_id
            )

        # Decode response
        generated_text = processor.decode(outputs[0], skip_special_tokens=True)

        # Extract assistant's response
        if "<|start_header_id|>assistant<|end_header_id|>" in generated_text:
            response = generated_text.split("<|start_header_id|>assistant<|end_header_id|>")[-1].strip()
        else:
            if input_text in generated_text:
                response = generated_text.replace(input_text, "").strip()
            else:
                response = generated_text

        # Clean up response
        response = response.replace("<|eot_id|>", "").strip()
        if response.startswith("assistant"):
            response = response[9:].strip()

        return response.strip()

    except Exception as e:
        return f"Error processing image: {str(e)}"


def process_text_only(custom_prompt, location=""):
    """Process text-only prompt with optional location"""
    global model, processor

    if model is None or processor is None:
        return "⚠️ Model not loaded. Please restart the application."

    if not custom_prompt.strip():
        return "📝 Please enter a prompt."

    try:
        # Add location context if provided
        location_text = ""
        if location.strip():
            location_text = f" The user is located in {location.strip()}."
        # Prepare the full prompt
        full_prompt = custom_prompt.strip() + location_text

        # Prepare messages for text-only processing
        messages = [
            {
                "role": "user",
                "content": [{"type": "text", "text": full_prompt}]
            }
        ]

        # Apply chat template
        input_text = processor.apply_chat_template(messages, add_generation_prompt=True)

        # Process inputs (text only)
        device = "cuda" if torch.cuda.is_available() else "cpu"
        inputs = processor.tokenizer(
            input_text,
            return_tensors="pt",
            padding=True,
            truncation=True
        ).to(device)

        # Generate response
        with torch.no_grad():
            torch._dynamo.config.disable = True
            outputs = model.generate(
                **inputs,
                max_new_tokens=512,
                use_cache=True,
                temperature=0.7,
                top_p=0.95,
                top_k=50,
                do_sample=True,
                pad_token_id=processor.tokenizer.eos_token_id,
                eos_token_id=processor.tokenizer.eos_token_id
            )

        # Decode response
        generated_text = processor.tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Extract assistant's response
        if "<|start_header_id|>assistant<|end_header_id|>" in generated_text:
            response = generated_text.split("<|start_header_id|>assistant<|end_header_id|>")[-1].strip()
        else:
            if input_text in generated_text:
                response = generated_text.replace(input_text, "").strip()
            else:
                response = generated_text

        # Clean up response
        response = response.replace("<|eot_id|>", "").strip()
        if response.startswith("assistant"):
            response = response[9:].strip()

        return response.strip()

    except Exception as e:
        return f"Error processing prompt: {str(e)}"


def extract_and_translate_text(image, target_language="English"):
    """Extract text from image and translate to target language"""
    global model, processor

    if model is None or processor is None:
        return "⚠️ Model not loaded. Please restart the application."

    if image is None:
        return "📸 Please upload an image first."

    try:
        # Convert gradio image to PIL if needed
        if not isinstance(image, Image.Image):
            image = Image.fromarray(image)

        # Create prompt for text extraction and translation
        prompt = f"""Extract all text from this image and translate it to {target_language}.

Please provide:
1. Original Text: [All text found in the image]
2. Translated Text: [The text translated to {target_language}]
3. Language Detection: [What language the original text appears to be in]

If no text is found, please indicate that no text was detected in the image."""

        # Prepare messages for the model
        messages = [
            {
                "role": "user",
                "content": [{"type": "image"}, {"type": "text", "text": prompt}]
            }
        ]

        # Apply chat template
        input_text = processor.apply_chat_template(messages, add_generation_prompt=True)

        # Process inputs
        device = "cuda" if torch.cuda.is_available() else "cpu"
        inputs = processor(
            image,
            input_text,
            add_special_tokens=False,
            return_tensors="pt",
        ).to(device)

        # Generate response
        with torch.no_grad():
            torch._dynamo.config.disable = True
            outputs = model.generate(
                **inputs,
                max_new_tokens=512,
                use_cache=True,
                temperature=0.3,
                top_p=0.9,
                top_k=50,
                do_sample=True,
                pad_token_id=processor.tokenizer.eos_token_id,
                eos_token_id=processor.tokenizer.eos_token_id
            )

        # Decode response
        generated_text = processor.decode(outputs[0], skip_special_tokens=True)

        # Extract assistant's response
        if "<|start_header_id|>assistant<|end_header_id|>" in generated_text:
            response = generated_text.split("<|start_header_id|>assistant<|end_header_id|>")[-1].strip()
        else:
            if input_text in generated_text:
                response = generated_text.replace(input_text, "").strip()
            else:
                response = generated_text

        # Clean up response
        response = response.replace("<|eot_id|>", "").strip()
        if response.startswith("assistant"):
            response = response[9:].strip()

        return response.strip()

    except Exception as e:
        return f"Error extracting and translating text: {str(e)}"


def format_travel_response(response, subject):
    """Format travel response for better frontend consumption"""
    import json
    import re

    try:
        # Try to extract JSON from response
        json_match = re.search(r'\{.*\}', response, re.DOTALL)
        if json_match:
            json_str = json_match.group(0)
            # Clean up common formatting issues
            json_str = re.sub(r'```json\s*|\s*```', '', json_str)
            json_str = re.sub(r'^\s*json\s*', '', json_str)
            try:
                parsed_json = json.loads(json_str)
                return json.dumps(parsed_json, indent=2, ensure_ascii=False)
            except json.JSONDecodeError:
                pass
    except Exception:
        pass

    # Fallback: return structured text if JSON parsing fails
    lines = response.strip().split('\n')
    clean_lines = [line.strip() for line in lines if line.strip()]
    return '\n'.join(clean_lines)


def get_travel_info(subject, location):
    """Get travel and cultural information based on subject and location"""
    global model, processor

    if model is None or processor is None:
        return "⚠️ Model not loaded. Please restart the application."

    if not location.strip():
        return "📍 Please enter a location."

    if not subject:
        return "📝 Please select a subject."
    try:
        # Define prompts based on subject
        prompts = {
            "Know about its history": f"""Provide a brief historical overview of {location} in JSON format:
{{
  "periods": [
    {{"era": "Period Name", "description": "Brief description"}},
    {{"era": "Period Name", "description": "Brief description"}},
    {{"era": "Period Name", "description": "Brief description"}}
  ],
  "significance": "Why this history matters to tourists"
}}
Keep each description under 25 words.""",

            "People and culture": f"""Describe the people and culture of {location} in JSON format:
{{
  "people": "Brief description of local people",
  "etiquette_tip": "One important etiquette rule",
  "major_tradition": "Name and brief description of key tradition",
  "social_atmosphere": "Description of general vibe and atmosphere"
}}
Keep each field under 20 words.""",

            "Places to visit": f"""List must-visit places in {location} in JSON format:
{{
  "places": [
    {{"name": "Place Name", "reason": "Why to visit"}},
    {{"name": "Place Name", "reason": "Why to visit"}},
    {{"name": "Place Name", "reason": "Why to visit"}},
    {{"name": "Place Name", "reason": "Why to visit"}},
    {{"name": "Place Name", "reason": "Why to visit"}}
  ]
}}
Keep each reason under 15 words.""",

            "Food to try out": f"""List iconic foods from {location} in JSON format:
{{
  "foods": [
    {{"name": "Food Name", "description": "What it is and what makes it special"}},
    {{"name": "Food Name", "description": "What it is and what makes it special"}},
    {{"name": "Food Name", "description": "What it is and what makes it special"}},
    {{"name": "Food Name", "description": "What it is and what makes it special"}},
    {{"name": "Food Name", "description": "What it is and what makes it special"}}
  ]
}}
Keep each field under 12 words."""
        }

        # Get the appropriate prompt
        prompt = prompts.get(subject, f"Tell me about {subject} in {location}.")

        # Prepare messages for text-only processing
        messages = [
            {
                "role": "user",
                "content": [{"type": "text", "text": prompt}]
            }
        ]

        # Apply chat template
        input_text = processor.apply_chat_template(messages, add_generation_prompt=True)

        # Process inputs (text only)
        device = "cuda" if torch.cuda.is_available() else "cpu"
        inputs = processor.tokenizer(
            input_text,
            return_tensors="pt",
            padding=True,
            truncation=True
        ).to(device)

        # Generate response
        with torch.no_grad():
            torch._dynamo.config.disable = True
            outputs = model.generate(
                **inputs,
                max_new_tokens=250,  # Reduced from 512 for faster travel responses
                use_cache=True,
                temperature=0.3,     # Lower temperature for more focused responses
                top_p=0.85,          # Reduced for more focused responses
                top_k=30,            # Reduced for faster generation
                do_sample=True,
                pad_token_id=processor.tokenizer.eos_token_id,
                eos_token_id=processor.tokenizer.eos_token_id,
                repetition_penalty=1.1  # Added to avoid repetition
            )

        # Decode response
        generated_text = processor.tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Extract assistant's response
        if "<|start_header_id|>assistant<|end_header_id|>" in generated_text:
            response = generated_text.split("<|start_header_id|>assistant<|end_header_id|>")[-1].strip()
        else:
            if input_text in generated_text:
                response = generated_text.replace(input_text, "").strip()
            else:
                response = generated_text

        # Clean up response
        response = response.replace("<|eot_id|>", "").strip()
        if response.startswith("assistant"):
            response = response[9:].strip()

        # Format the response for better frontend consumption
        formatted_response = format_travel_response(response, subject)
        return formatted_response

    except Exception as e:
        return f"Error getting travel information: {str(e)}"


def generate_greeting(user_name, location):
    """Generate a personalized greeting message based on user's location"""
    global model, processor

    if model is None or processor is None:
        return "⚠️ Model not loaded. Please restart the application."

    if not user_name.strip():
        return "👋 Please enter your name."

    if not location.strip():
        return "📍 Please enter your location."

    try:
        # Create a concise prompt for greeting generation
        prompt = f"""Generate a warm, engaging, and personalized greeting for {user_name.strip()} from {location.strip()}.

Return ONLY a JSON response:
{{
  "greeting": "Creative, personalized welcome message that includes their name, location, and something interesting about their area or culture. Make it warm, unique, and memorable (max 40 words)"
}}

Make it friendly, culturally aware, and include interesting local references or cultural elements. Be creative and welcoming."""

        # Prepare messages for text-only processing
        messages = [
            {
                "role": "user",
                "content": [{"type": "text", "text": prompt}]
            }
        ]

        # Apply chat template
        input_text = processor.apply_chat_template(messages, add_generation_prompt=True)

        # Process inputs (text only)
        device = "cuda" if torch.cuda.is_available() else "cpu"
        inputs = processor.tokenizer(
            input_text,
            return_tensors="pt",
            padding=True,
            truncation=True
        ).to(device)

        # Generate response - optimized for speed
        with torch.no_grad():
            torch._dynamo.config.disable = True
            outputs = model.generate(
                **inputs,
                max_new_tokens=100,  # Very short for fast greetings
                use_cache=True,
                temperature=0.2,     # Low temperature for consistent greetings
                top_p=0.8,           # Focused generation
                top_k=20,            # Even more focused for speed
                do_sample=True,
                pad_token_id=processor.tokenizer.eos_token_id,
                eos_token_id=processor.tokenizer.eos_token_id,
                repetition_penalty=1.1
            )

        # Decode response
        generated_text = processor.tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Extract assistant's response
        if "<|start_header_id|>assistant<|end_header_id|>" in generated_text:
            response = generated_text.split("<|start_header_id|>assistant<|end_header_id|>")[-1].strip()
        else:
            if input_text in generated_text:
                response = generated_text.replace(input_text, "").strip()
            else:
                response = generated_text

        # Clean up response
        response = response.replace("<|eot_id|>", "").strip()
        if response.startswith("assistant"):
            response = response[9:].strip()

        print(f"Generated greeting: {response}")
        return response

    except Exception as e:
        return f"Error generating greeting: {str(e)}"


# Simple, clean CSS focused on functionality
custom_css = """
.gradio-container {
    max-width: 1200px;
    margin: 0 auto;
    padding: 20px;
    font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
}
.main-header {
    text-align: center;
    margin-bottom: 30px;
    padding: 20px;
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    border-radius: 10px;
}
.api-section {
    background: #f8f9fa;
    border-radius: 8px;
    padding: 20px;
    margin: 20px 0;
    border-left: 4px solid #007bff;
}
.gr-button {
    background: #007bff !important;
    border: none !important;
    border-radius: 6px !important;
    padding: 10px 20px !important;
    font-weight: 500 !important;
}
.gr-button:hover {
    background: #0056b3 !important;
}
.gr-textbox, .gr-radio {
    border-radius: 6px !important;
    border: 1px solid #ddd !important;
}
.gr-image {
    border-radius: 8px !important;
    border: 2px dashed #ddd !important;
}
.output-box {
    background: #f8f9fa !important;
    border-radius: 8px !important;
    border: 1px solid #e9ecef !important;
    min-height: 300px !important;
}
"""
# Create the simplified Gradio interface
with gr.Blocks(css=custom_css, title="GemmaYaatri API") as demo:
    gr.HTML("""
        <div class="main-header">
            <h1>🔬 GemmaYaatri</h1>
        </div>
    """)

    with gr.Tabs():
        # Tab 1: Preset Identification
        with gr.TabItem("🎯 Quick Identification"):
            gr.HTML("""
                <div class="api-section">
                    <h3>📋 Preset Analysis</h3>
                    <p>Upload an image and choose the identification type for structured analysis.</p>
                </div>
""") with gr.Row(): with gr.Column(scale=1): image_input_preset = gr.Image( label="Upload Image", type="pil", height=300 ) identification_type = gr.Radio( choices=["Plant", "Insect"], value="Plant", label="Identification Type" ) analyze_btn = gr.Button("πŸ” Analyze", variant="primary") with gr.Column(scale=1): output_preset = gr.Textbox( label="Analysis Results", lines=15, elem_classes=["output-box"], placeholder="Results will appear here..." ) # Tab 2: Text Translation with gr.TabItem("🌐 Text Translation"): gr.HTML("""

πŸ”€ Image Text Translation

Upload an image containing text and select a target language for translation.

""") with gr.Row(): with gr.Column(scale=1): image_input_custom = gr.Image( label="Upload Image with Text", type="pil", height=300 ) target_language = gr.Dropdown( choices=[ "English", "Spanish", "French", "German", "Italian", "Portuguese", "Dutch", "Russian", "Chinese", "Japanese", "Korean", "Arabic", "Hindi", "Bengali", "Tamil", "Telugu", "Marathi", "Gujarati", "Kannada", "Malayalam", "Punjabi", "Urdu", "Thai", "Vietnamese", "Indonesian", "Malay", "Turkish", "Greek", "Hebrew", "Polish", "Czech", "Hungarian", "Romanian", "Bulgarian", "Croatian", "Serbian", "Slovak", "Slovenian", "Estonian", "Latvian", "Lithuanian", "Finnish", "Swedish", "Norwegian", "Danish" ], value="English", label="Target Language", info="Select the language to translate the text to" ) translate_btn = gr.Button("πŸ”€ Extract & Translate", variant="primary") with gr.Column(scale=1): output_custom = gr.Textbox( label="Extracted & Translated Text", lines=15, elem_classes=["output-box"], placeholder="Extracted and translated text will appear here..." ) process_btn = gr.Button("οΏ½ Process", variant="primary") with gr.Column(scale=1): output_custom = gr.Textbox( label="Custom Analysis Results", lines=15, elem_classes=["output-box"], placeholder="Custom analysis results will appear here..." ) # Tab 3: Text-Only Query with gr.TabItem("πŸ’¬ Text Query"): gr.HTML("""

πŸ’­ Text-Only Analysis

Ask questions about plants, insects, or biological topics without uploading an image.

""") with gr.Row(): with gr.Column(scale=1): text_prompt = gr.Textbox( label="Your Question", lines=6, placeholder="Example: 'What are the medicinal properties of turmeric?' or 'Tell me about beneficial insects for organic farming...'", info="Ask any question about plants, insects, or biological topics" ) location_input = gr.Textbox( label="Location (Optional)", placeholder="e.g., California, USA or Kerala, India", info="Helps provide region-specific information" ) query_btn = gr.Button("πŸ’¬ Ask Question", variant="primary") with gr.Column(scale=1): output_text = gr.Textbox( label="Response", lines=15, elem_classes=["output-box"], placeholder="Your answer will appear here..." ) # Tab 4: Travel & Culture Guide with gr.TabItem("🌍 Travel Guide"): gr.HTML("""

πŸ—ΊοΈ Travel & Culture Information

Get expert insights about any location's history, culture, attractions, and local cuisine.

""") with gr.Row(): with gr.Column(scale=1): travel_location = gr.Textbox( label="Location", placeholder="e.g., Paris, France or Kyoto, Japan or Kerala, India", info="Enter any city, region, or country" ) travel_subject = gr.Radio( choices=[ "Know about its history", "People and culture", "Places to visit", "Food to try out" ], value="Places to visit", label="What would you like to know?", info="Select the type of information you want" ) travel_btn = gr.Button("πŸ—ΊοΈ Get Travel Info", variant="primary") with gr.Column(scale=1): output_travel = gr.Textbox( label="Travel Information", lines=15, elem_classes=["output-box"], placeholder="Travel and cultural information will appear here..." ) # Tab 5: Personal Greeting with gr.TabItem("πŸ‘‹ Personal Greeting"): gr.HTML("""

🌟 Personalized Welcome

Get a warm, culturally appropriate greeting message tailored to your location.

""") with gr.Row(): with gr.Column(scale=1): user_name = gr.Textbox( label="Your Name", placeholder="e.g., Alex, Maria, or Priya", info="Enter your first name" ) user_location = gr.Textbox( label="Your Location", placeholder="e.g., Mumbai, India or New York, USA", info="Enter your city and country" ) greeting_btn = gr.Button("πŸ‘‹ Generate Greeting", variant="primary") with gr.Column(scale=1): output_greeting = gr.Textbox( label="Personal Greeting", lines=8, elem_classes=["output-box"], placeholder="Your personalized greeting will appear here..." ) # Event handlers analyze_btn.click( fn=process_image_with_preset, inputs=[image_input_preset, identification_type], outputs=output_preset ) translate_btn.click( fn=extract_and_translate_text, inputs=[image_input_custom, target_language], outputs=output_custom ) query_btn.click( fn=process_text_only, inputs=[text_prompt, location_input], outputs=output_text ) travel_btn.click( fn=get_travel_info, inputs=[travel_subject, travel_location], outputs=output_travel ) greeting_btn.click( fn=generate_greeting, inputs=[user_name, user_location], outputs=output_greeting ) # Auto-process on image upload for preset image_input_preset.change( fn=process_image_with_preset, inputs=[image_input_preset, identification_type], outputs=output_preset ) # Footer gr.HTML("""

πŸš€ GemmaYaatri Vision API | Powered by Gemma-3n-E2B-it

For educational and research purposes

""") # Launch the app if __name__ == "__main__": if not model_loaded: print("Failed to load model. Please check the model path and try again.") # Check if running on HuggingFace Spaces import os is_hf_space = os.getenv('SPACE_ID') is not None if is_hf_space: # HuggingFace Spaces configuration demo.launch( server_name="0.0.0.0", server_port=7860, share=False, # Don't use share on HF Spaces debug=False # Disable debug on production ) else: # Local development configuration demo.launch( server_name="127.0.0.1", server_port=7860, share=True, debug=True )