import gradio as gr
import torch

try:
    from unsloth import FastVisionModel
    UNSLOTH_AVAILABLE = True
except (ImportError, NotImplementedError):
    print("Unsloth not available or not supported, falling back to standard transformers")
    from transformers import AutoModelForVision2Seq, AutoProcessor
    UNSLOTH_AVAILABLE = False

from PIL import Image
import os
import warnings

# Disable torch compilation to avoid inductor backend issues
if hasattr(torch, '_dynamo'):
    torch._dynamo.config.disable = True
if hasattr(torch.backends, 'cudnn'):
    torch.backends.cudnn.allow_tf32 = True
if hasattr(torch.backends, 'cuda'):
    torch.backends.cuda.matmul.allow_tf32 = True

# Suppress warnings
warnings.filterwarnings('ignore')

# Global variables for model and processor
model = None
processor = None


def load_model():
    """Load the trained plant model"""
    global model, processor
    try:
        print("Loading base model...")
        if UNSLOTH_AVAILABLE:
            print("Using Unsloth FastVisionModel...")
            # Disable torch compilation for inference
            if hasattr(torch, '_dynamo'):
                torch._dynamo.reset()

            # Load the base model with unsloth
            model, processor = FastVisionModel.from_pretrained(
                "unsloth/gemma-3n-e2b-it-unsloth-bnb-4bit",
                load_in_4bit=True,
                use_gradient_checkpointing="unsloth",
            )

            print("Setting up LoRA configuration...")
            # Set up LoRA configuration to match training
            model = FastVisionModel.get_peft_model(
                model,
                finetune_vision_layers=True,
                finetune_language_layers=True,
                finetune_attention_modules=True,
                finetune_mlp_modules=True,
                r=16,
                lora_alpha=16,
                lora_dropout=0,
                bias="none",
                random_state=3407,
                use_rslora=False,
                loftq_config=None,
                target_modules="all-linear",
                modules_to_save=[
                    "lm_head",
                    "embed_tokens",
                ],
            )

            print("Loading trained adapter...")
            # Load the trained adapter weights
            adapter_path = "./gemma-3n_pois_med_plants"
            if os.path.exists(adapter_path):
                model.load_adapter(adapter_path, adapter_name="default")
                print("Adapter loaded successfully!")
            else:
                print(f"Warning: Model path {adapter_path} not found. Using base model.")

            # Enable inference mode with compilation disabled
            FastVisionModel.for_inference(model)
            # Ensure model is in eval mode
            model.eval()
            print("Model loaded successfully!")
            return True
        else:
            print("Unsloth not available, using base model only")
            # Fallback to base model without adapter
            from transformers import AutoModelForVision2Seq, AutoProcessor
            model = AutoModelForVision2Seq.from_pretrained(
                "google/gemma-3n-e2b-it",
                torch_dtype=torch.float16,
                device_map="auto"
            )
            processor = AutoProcessor.from_pretrained("google/gemma-3n-e2b-it")
            model.eval()
            print("Base model loaded successfully!")
            return True
    except Exception as e:
        print(f"Error loading model: {str(e)}")
        return False


# Load model on startup
print("Initializing model...")
model_loaded = load_model()


def process_image_with_preset(image, identification_type="Plant"):
    """Process image with preset prompts"""
    global model, processor

    if model is None or processor is None:
        return "⚠️ Model not loaded. Please restart the application."
    if image is None:
        return "📸 Please upload an image first."

    try:
        # Convert gradio image to PIL if needed
        if not isinstance(image, Image.Image):
            image = Image.fromarray(image)

        # Set instruction based on identification type with simple format
        if "Plant" in identification_type:
            instruction = "Identify this plant. Tell me its name, key features, and if it's safe or toxic."
        else:
            instruction = "Identify this insect. Tell me its name, key features, and if it's beneficial or harmful."
        # Prepare messages for the model
        messages = [
            {
                "role": "user",
                "content": [{"type": "image"}, {"type": "text", "text": instruction}]
            }
        ]

        # Apply chat template
        input_text = processor.apply_chat_template(messages, add_generation_prompt=True)

        # Process inputs
        device = "cuda" if torch.cuda.is_available() else "cpu"
        inputs = processor(
            image,
            input_text,
            add_special_tokens=False,
            return_tensors="pt",
        ).to(device)

        # Generate response with simple parameters
        with torch.no_grad():
            torch._dynamo.config.disable = True
            outputs = model.generate(
                **inputs,
                max_new_tokens=300,  # Increased from 150 to allow complete responses
                use_cache=True,
                temperature=0.5,
                top_p=0.8,
                top_k=30,
                do_sample=True,
                repetition_penalty=1.1,
                pad_token_id=processor.tokenizer.eos_token_id,
                eos_token_id=processor.tokenizer.eos_token_id
            )

        # Decode response
        generated_text = processor.decode(outputs[0], skip_special_tokens=True)
        print(f"Generated text: {generated_text}")

        # Extract assistant's response
        if "<|start_header_id|>assistant<|end_header_id|>" in generated_text:
            response = generated_text.split("<|start_header_id|>assistant<|end_header_id|>")[-1].strip()
        else:
            if input_text in generated_text:
                response = generated_text.replace(input_text, "").strip()
            else:
                response = generated_text

        # Clean up response - simple approach
        response = response.replace("<|eot_id|>", "").strip()
        if response.startswith("assistant"):
            response = response[9:].strip()

        # Basic cleaning - remove markdown and extra formatting
        response = response.replace('**', '').replace('*', '')
        response = response.replace('##', '').replace('#', '')

        # Remove empty lines and join, but preserve paragraph structure
        lines = [line.strip() for line in response.split('\n') if line.strip()]
        final_response = '\n'.join(lines)

        # Check if response seems incomplete (ends abruptly)
        if final_response and not final_response.endswith(('.', '!', '?', ':')):
            # If it doesn't end with proper punctuation, it might be cut off
            if len(final_response) > 50:  # Only if it's a substantial response
                final_response += "..."  # Add ellipsis to indicate truncation

        print(f"Final response: {final_response}")
        return final_response if final_response else "Unable to identify the specimen in the image. Please try with a clearer image."

    except Exception as e:
        return f"Error processing image: {str(e)}"


def process_with_custom_image_with_prompt(image, custom_prompt):
    global model, processor

    if model is None or processor is None:
        return "⚠️ Model not loaded. Please restart the application."
    if image is None:
        return "📸 Please upload an image first."
    if not custom_prompt.strip():
        return "📝 Please enter a prompt."
    try:
        # Convert gradio image to PIL if needed
        if not isinstance(image, Image.Image):
            image = Image.fromarray(image)

        # Prepare messages for the model
        messages = [
            {
                "role": "user",
                "content": [{"type": "image"}, {"type": "text", "text": custom_prompt.strip()}]
            }
        ]

        # Apply chat template
        input_text = processor.apply_chat_template(messages, add_generation_prompt=True)

        # Process inputs
        device = "cuda" if torch.cuda.is_available() else "cpu"
        inputs = processor(
            image,
            input_text,
            add_special_tokens=False,
            return_tensors="pt",
        ).to(device)

        # Generate response
        with torch.no_grad():
            torch._dynamo.config.disable = True
            outputs = model.generate(
                **inputs,
                max_new_tokens=512,
                use_cache=True,
                temperature=0.7,
                top_p=0.95,
                top_k=50,
                do_sample=True,
                pad_token_id=processor.tokenizer.eos_token_id,
                eos_token_id=processor.tokenizer.eos_token_id
            )

        # Decode response
        generated_text = processor.decode(outputs[0], skip_special_tokens=True)

        # Extract assistant's response
        if "<|start_header_id|>assistant<|end_header_id|>" in generated_text:
            response = generated_text.split("<|start_header_id|>assistant<|end_header_id|>")[-1].strip()
        else:
            if input_text in generated_text:
                response = generated_text.replace(input_text, "").strip()
            else:
                response = generated_text

        # Clean up response
        response = response.replace("<|eot_id|>", "").strip()
        if response.startswith("assistant"):
            response = response[9:].strip()

        return response.strip()

    except Exception as e:
        return f"Error processing image: {str(e)}"


def process_text_only(custom_prompt, location=""):
    """Process text-only prompt with optional location"""
    global model, processor

    if model is None or processor is None:
        return "⚠️ Model not loaded. Please restart the application."
    if not custom_prompt.strip():
        return "📝 Please enter a prompt."

    try:
        # Add location context if provided
        location_text = ""
        if location.strip():
            location_text = f" The user is located in {location.strip()}."
        # Prepare the full prompt
        full_prompt = custom_prompt.strip() + location_text

        # Prepare messages for text-only processing
        messages = [
            {
                "role": "user",
                "content": [{"type": "text", "text": full_prompt}]
            }
        ]

        # Apply chat template
        input_text = processor.apply_chat_template(messages, add_generation_prompt=True)

        # Process inputs (text only)
        device = "cuda" if torch.cuda.is_available() else "cpu"
        inputs = processor.tokenizer(
            input_text,
            return_tensors="pt",
            padding=True,
            truncation=True
        ).to(device)

        # Generate response
        with torch.no_grad():
            torch._dynamo.config.disable = True
            outputs = model.generate(
                **inputs,
                max_new_tokens=512,
                use_cache=True,
                temperature=0.7,
                top_p=0.95,
                top_k=50,
                do_sample=True,
                pad_token_id=processor.tokenizer.eos_token_id,
                eos_token_id=processor.tokenizer.eos_token_id
            )

        # Decode response
        generated_text = processor.tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Extract assistant's response
        if "<|start_header_id|>assistant<|end_header_id|>" in generated_text:
            response = generated_text.split("<|start_header_id|>assistant<|end_header_id|>")[-1].strip()
        else:
            if input_text in generated_text:
                response = generated_text.replace(input_text, "").strip()
            else:
                response = generated_text

        # Clean up response
        response = response.replace("<|eot_id|>", "").strip()
        if response.startswith("assistant"):
            response = response[9:].strip()

        return response.strip()

    except Exception as e:
        return f"Error processing prompt: {str(e)}"


def extract_and_translate_text(image, target_language="English"):
    """Extract text from image and translate to target language"""
    global model, processor

    if model is None or processor is None:
        return "⚠️ Model not loaded. Please restart the application."
    if image is None:
        return "📸 Please upload an image first."

    try:
        # Convert gradio image to PIL if needed
        if not isinstance(image, Image.Image):
            image = Image.fromarray(image)

        # Create prompt for text extraction and translation
        prompt = f"""Extract all text from this image and translate it to {target_language}.

Please provide:
1. Original Text: [All text found in the image]
2. Translated Text: [The text translated to {target_language}]
3. Language Detection: [What language the original text appears to be in]

If no text is found, please indicate that no text was detected in the image."""

        # Prepare messages for the model
        messages = [
            {
                "role": "user",
                "content": [{"type": "image"}, {"type": "text", "text": prompt}]
            }
        ]

        # Apply chat template
        input_text = processor.apply_chat_template(messages, add_generation_prompt=True)

        # Process inputs
        device = "cuda" if torch.cuda.is_available() else "cpu"
        inputs = processor(
            image,
            input_text,
            add_special_tokens=False,
            return_tensors="pt",
        ).to(device)

        # Generate response
        with torch.no_grad():
            torch._dynamo.config.disable = True
            outputs = model.generate(
                **inputs,
                max_new_tokens=512,
                use_cache=True,
                temperature=0.3,
                top_p=0.9,
                top_k=50,
                do_sample=True,
                pad_token_id=processor.tokenizer.eos_token_id,
                eos_token_id=processor.tokenizer.eos_token_id
            )

        # Decode response
        generated_text = processor.decode(outputs[0], skip_special_tokens=True)

        # Extract assistant's response
        if "<|start_header_id|>assistant<|end_header_id|>" in generated_text:
            response = generated_text.split("<|start_header_id|>assistant<|end_header_id|>")[-1].strip()
        else:
            if input_text in generated_text:
                response = generated_text.replace(input_text, "").strip()
            else:
                response = generated_text

        # Clean up response
        response = response.replace("<|eot_id|>", "").strip()
        if response.startswith("assistant"):
            response = response[9:].strip()

        return response.strip()

    except Exception as e:
        return f"Error extracting and translating text: {str(e)}"


def format_travel_response(response, subject):
    """Format travel response for better frontend consumption"""
    import json
    import re

    try:
        # Try to extract JSON from response
        json_match = re.search(r'\{.*\}', response, re.DOTALL)
        if json_match:
            json_str = json_match.group(0)
            # Clean up common formatting issues
            json_str = re.sub(r'```json\s*|\s*```', '', json_str)
            json_str = re.sub(r'^\s*json\s*', '', json_str)
            try:
                parsed_json = json.loads(json_str)
                return json.dumps(parsed_json, indent=2, ensure_ascii=False)
            except json.JSONDecodeError:
                pass
    except Exception:
        pass

    # Fallback: return structured text if JSON parsing fails
    lines = response.strip().split('\n')
    clean_lines = [line.strip() for line in lines if line.strip()]
    return '\n'.join(clean_lines)


def get_travel_info(subject, location):
    """Get travel and cultural information based on subject and location"""
    global model, processor

    if model is None or processor is None:
        return "⚠️ Model not loaded. Please restart the application."
    if not location.strip():
        return "📍 Please enter a location."
    if not subject:
        return "📋 Please select a subject."
    try:
        # Define prompts based on subject
        prompts = {
            "Know about its history": f"""Provide a brief historical overview of {location} in JSON format:
{{
  "periods": [
    {{"era": "Period Name", "description": "Brief description"}},
    {{"era": "Period Name", "description": "Brief description"}},
    {{"era": "Period Name", "description": "Brief description"}}
  ],
  "significance": "Why this history matters to tourists"
}}
Keep each description under 25 words.""",

            "People and culture": f"""Describe the people and culture of {location} in JSON format:
{{
  "people": "Brief description of local people",
  "etiquette_tip": "One important etiquette rule",
  "major_tradition": "Name and brief description of key tradition",
  "social_atmosphere": "Description of general vibe and atmosphere"
}}
Keep each field under 20 words.""",

            "Places to visit": f"""List must-visit places in {location} in JSON format:
{{
  "places": [
    {{"name": "Place Name", "reason": "Why to visit"}},
    {{"name": "Place Name", "reason": "Why to visit"}},
    {{"name": "Place Name", "reason": "Why to visit"}},
    {{"name": "Place Name", "reason": "Why to visit"}},
    {{"name": "Place Name", "reason": "Why to visit"}}
  ]
}}
Keep each reason under 15 words.""",

            "Food to try out": f"""List iconic foods from {location} in JSON format:
{{
  "foods": [
    {{"name": "Food Name", "description": "What it is and what makes it special"}},
    {{"name": "Food Name", "description": "What it is and what makes it special"}},
    {{"name": "Food Name", "description": "What it is and what makes it special"}},
    {{"name": "Food Name", "description": "What it is and what makes it special"}},
    {{"name": "Food Name", "description": "What it is and what makes it special"}}
  ]
}}
Keep each field under 12 words."""
        }

        # Get the appropriate prompt
        prompt = prompts.get(subject, f"Tell me about {subject} in {location}.")

        # Prepare messages for text-only processing
        messages = [
            {
                "role": "user",
                "content": [{"type": "text", "text": prompt}]
            }
        ]

        # Apply chat template
        input_text = processor.apply_chat_template(messages, add_generation_prompt=True)

        # Process inputs (text only)
        device = "cuda" if torch.cuda.is_available() else "cpu"
        inputs = processor.tokenizer(
            input_text,
            return_tensors="pt",
            padding=True,
            truncation=True
        ).to(device)

        # Generate response
        with torch.no_grad():
            torch._dynamo.config.disable = True
            outputs = model.generate(
                **inputs,
                max_new_tokens=250,  # Reduced from 512 for faster travel responses
                use_cache=True,
                temperature=0.3,  # Lower temperature for more focused responses
                top_p=0.85,  # Reduced for more focused responses
                top_k=30,  # Reduced for faster generation
                do_sample=True,
                pad_token_id=processor.tokenizer.eos_token_id,
                eos_token_id=processor.tokenizer.eos_token_id,
                repetition_penalty=1.1  # Added to avoid repetition
            )

        # Decode response
        generated_text = processor.tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Extract assistant's response
        if "<|start_header_id|>assistant<|end_header_id|>" in generated_text:
            response = generated_text.split("<|start_header_id|>assistant<|end_header_id|>")[-1].strip()
        else:
            if input_text in generated_text:
                response = generated_text.replace(input_text, "").strip()
            else:
                response = generated_text

        # Clean up response
        response = response.replace("<|eot_id|>", "").strip()
        if response.startswith("assistant"):
            response = response[9:].strip()

        # Format the response for better frontend consumption
        formatted_response = format_travel_response(response, subject)
        return formatted_response

    except Exception as e:
        return f"Error getting travel information: {str(e)}"
def generate_greeting(user_name, location):
    """Generate a personalized greeting message based on user's location"""
    global model, processor

    if model is None or processor is None:
        return "⚠️ Model not loaded. Please restart the application."
    if not user_name.strip():
        return "📝 Please enter your name."
    if not location.strip():
        return "📍 Please enter your location."

    try:
        # Create a concise prompt for greeting generation
        prompt = f"""Generate a warm, engaging, and personalized greeting for {user_name.strip()} from {location.strip()}.

Return ONLY a JSON response:
{{
  "greeting": "Creative, personalized welcome message that includes their name, location, and something interesting about their area or culture. Make it warm, unique, and memorable (max 40 words)"
}}

Make it friendly, culturally aware, and include interesting local references or cultural elements. Be creative and welcoming."""

        # Prepare messages for text-only processing
        messages = [
            {
                "role": "user",
                "content": [{"type": "text", "text": prompt}]
            }
        ]

        # Apply chat template
        input_text = processor.apply_chat_template(messages, add_generation_prompt=True)

        # Process inputs (text only)
        device = "cuda" if torch.cuda.is_available() else "cpu"
        inputs = processor.tokenizer(
            input_text,
            return_tensors="pt",
            padding=True,
            truncation=True
        ).to(device)

        # Generate response - optimized for speed
        with torch.no_grad():
            torch._dynamo.config.disable = True
            outputs = model.generate(
                **inputs,
                max_new_tokens=100,  # Very short for fast greetings
                use_cache=True,
                temperature=0.2,  # Low temperature for consistent greetings
                top_p=0.8,  # Focused generation
                top_k=20,  # Even more focused for speed
                do_sample=True,
                pad_token_id=processor.tokenizer.eos_token_id,
                eos_token_id=processor.tokenizer.eos_token_id,
                repetition_penalty=1.1
            )

        # Decode response
        generated_text = processor.tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Extract assistant's response
        if "<|start_header_id|>assistant<|end_header_id|>" in generated_text:
            response = generated_text.split("<|start_header_id|>assistant<|end_header_id|>")[-1].strip()
        else:
            if input_text in generated_text:
                response = generated_text.replace(input_text, "").strip()
            else:
                response = generated_text

        # Clean up response
        response = response.replace("<|eot_id|>", "").strip()
        if response.startswith("assistant"):
            response = response[9:].strip()

        print(f"Generated greeting: {response}")
        return response

    except Exception as e:
        return f"Error generating greeting: {str(e)}"


# Simple, clean CSS focused on functionality
custom_css = """
.gradio-container {
    max-width: 1200px;
    margin: 0 auto;
    padding: 20px;
    font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
}
.main-header {
    text-align: center;
    margin-bottom: 30px;
    padding: 20px;
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    border-radius: 10px;
}
.api-section {
    background: #f8f9fa;
    border-radius: 8px;
    padding: 20px;
    margin: 20px 0;
    border-left: 4px solid #007bff;
}
.gr-button {
    background: #007bff !important;
    border: none !important;
    border-radius: 6px !important;
    padding: 10px 20px !important;
    font-weight: 500 !important;
}
.gr-button:hover {
    background: #0056b3 !important;
}
.gr-textbox, .gr-radio {
    border-radius: 6px !important;
    border: 1px solid #ddd !important;
}
.gr-image {
    border-radius: 8px !important;
    border: 2px dashed #ddd !important;
}
.output-box {
    background: #f8f9fa !important;
    border-radius: 8px !important;
    border: 1px solid #e9ecef !important;
    min-height: 300px !important;
}
"""

# Create the simplified Gradio interface
with gr.Blocks(css=custom_css, title="GemmaYaatri API") as demo:
    gr.HTML("""
        Upload an image and choose the identification type for structured analysis.
        Upload an image containing text and select a target language for translation.
        Ask questions about plants, insects, or biological topics without uploading an image.
        Get expert insights about any location's history, culture, attractions, and local cuisine.
        Get a warm, culturally appropriate greeting message tailored to your location.
        GemmaYaatri Vision API | Powered by Gemma-3n-E2B-it
        For educational and research purposes
    """)
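
    # ------------------------------------------------------------------
    # UI wiring sketch: a minimal, illustrative assumption about how the
    # functions above could be exposed as tabs. Tab names, component
    # labels, and layout are placeholders, not taken from the original
    # interface definition; only the function signatures and the travel
    # subject choices come from the code above.
    # ------------------------------------------------------------------
    with gr.Tab("Identify Plant / Insect"):
        id_image = gr.Image(label="Image")
        id_type = gr.Radio(["Plant", "Insect"], value="Plant", label="Identification type")
        id_output = gr.Textbox(label="Result", elem_classes="output-box")
        gr.Button("Identify").click(
            process_image_with_preset, inputs=[id_image, id_type], outputs=id_output
        )

    with gr.Tab("Extract & Translate Text"):
        tr_image = gr.Image(label="Image with text")
        tr_lang = gr.Textbox(label="Target language", value="English")
        tr_output = gr.Textbox(label="Result", elem_classes="output-box")
        gr.Button("Translate").click(
            extract_and_translate_text, inputs=[tr_image, tr_lang], outputs=tr_output
        )

    with gr.Tab("Ask a Question"):
        q_prompt = gr.Textbox(label="Question")
        q_location = gr.Textbox(label="Location (optional)")
        q_output = gr.Textbox(label="Answer", elem_classes="output-box")
        gr.Button("Ask").click(
            process_text_only, inputs=[q_prompt, q_location], outputs=q_output
        )

    with gr.Tab("Travel Guide"):
        tv_subject = gr.Radio(
            ["Know about its history", "People and culture", "Places to visit", "Food to try out"],
            label="Subject",
        )
        tv_location = gr.Textbox(label="Location")
        tv_output = gr.Textbox(label="Result", elem_classes="output-box")
        gr.Button("Get info").click(
            get_travel_info, inputs=[tv_subject, tv_location], outputs=tv_output
        )

    with gr.Tab("Greeting"):
        greet_name = gr.Textbox(label="Your name")
        greet_location = gr.Textbox(label="Your location")
        greet_output = gr.Textbox(label="Greeting", elem_classes="output-box")
        gr.Button("Greet").click(
            generate_greeting, inputs=[greet_name, greet_location], outputs=greet_output
        )


if __name__ == "__main__":
    demo.launch()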