import json
import logging
import os
from typing import List

import gradio as gr
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForImageTextToText

# ZeroGPU support: the `spaces` package only exists on Hugging Face Spaces,
# so fall back to a no-op decorator when running locally.
try:
    import spaces
    ZEROGPU_AVAILABLE = True
except ImportError:
    ZEROGPU_AVAILABLE = False

    class _NoOpSpaces:
        """Fallback so @spaces.GPU is a harmless no-op outside Spaces."""

        @staticmethod
        def GPU(*args, **kwargs):
            def decorator(fn):
                return fn
            return decorator

    spaces = _NoOpSpaces()

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Model configuration
MODEL_ID = "Tonic/l-android-control"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Get Hugging Face token from environment variable (Spaces secrets)
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    logger.warning("HF_TOKEN not found in environment variables. Model access may be restricted.")


class LOperatorDemo:
    def __init__(self):
        self.model = None
        self.processor = None
        self.is_loaded = False

    def load_model(self):
        """Load the L-Operator model and processor."""
        try:
            logger.info(f"Loading model {MODEL_ID} on device {DEVICE}")

            # Check if token is available
            if not HF_TOKEN:
                return "❌ HF_TOKEN not found. Please set HF_TOKEN in Spaces secrets."

            # Load processor with token
            self.processor = AutoProcessor.from_pretrained(
                MODEL_ID,
                trust_remote_code=True,
                token=HF_TOKEN,
            )

            # Load model with token
            self.model = AutoModelForImageTextToText.from_pretrained(
                MODEL_ID,
                torch_dtype=torch.bfloat16 if DEVICE == "cuda" else torch.float32,
                trust_remote_code=True,
                device_map="auto" if DEVICE == "cuda" else None,
                token=HF_TOKEN,
            )
            if DEVICE == "cpu":
                self.model = self.model.to(DEVICE)

            self.is_loaded = True
            logger.info("Model loaded successfully with token authentication")
            return "✅ Model loaded successfully with token authentication!"
        except Exception as e:
            logger.error(f"Error loading model: {str(e)}")
            return f"❌ Error loading model: {str(e)}"

    @spaces.GPU(duration=120)  # 2 minutes for action generation
    def generate_action(self, image: Image.Image, goal: str, instruction: str) -> str:
        """Generate an action from a screenshot plus goal/step text."""
        if not self.is_loaded:
            return "❌ Model not loaded. Please load the model first."

        try:
            # Convert image to RGB if needed
            if image.mode != "RGB":
                image = image.convert("RGB")

            # Build conversation
            conversation = [
                {
                    "role": "system",
                    "content": [
                        {"type": "text", "text": "You are a helpful multimodal assistant by Liquid AI."}
                    ],
                },
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "image": image},
                        {
                            "type": "text",
                            "text": (
                                f"Goal: {goal}\nStep: {instruction}\n"
                                "Respond with a JSON action containing relevant keys "
                                "(e.g., action_type, x, y, text, app_name, direction)."
                            ),
                        },
                    ],
                },
            ]

            # Process inputs; return_dict=True so pixel values travel with input_ids
            inputs = self.processor.apply_chat_template(
                conversation,
                add_generation_prompt=True,
                tokenize=True,
                return_dict=True,
                return_tensors="pt",
            ).to(self.model.device)

            # Generate response
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=128,
                    do_sample=True,
                    temperature=0.7,
                    top_p=0.9,
                )

            # Decode only the newly generated tokens
            response = self.processor.tokenizer.decode(
                outputs[0][inputs["input_ids"].shape[1]:],
                skip_special_tokens=True,
            )

            # Try to parse as JSON for better formatting
            try:
                parsed_response = json.loads(response)
                return json.dumps(parsed_response, indent=2)
            except json.JSONDecodeError:
                return response
        except Exception as e:
            logger.error(f"Error generating action: {str(e)}")
            return f"❌ Error generating action: {str(e)}"

    @spaces.GPU(duration=90)  # 1.5 minutes for chat responses
    def chat_with_model(self, message: str, history: List[List[str]], image: Image.Image = None) -> str:
        """Chat function for gr.ChatInterface: returns the bot reply as a string."""
        if not self.is_loaded:
            return "❌ Model not loaded. Please load the model first."
        if image is None:
            return "❌ Please upload an Android screenshot image."

        try:
            # Extract goal and instruction from a structured message if present
            if "Goal:" in message and "Step:" in message:
                goal = ""
                instruction = ""
                for line in message.split("\n"):
                    if line.startswith("Goal:"):
                        goal = line.replace("Goal:", "").strip()
                    elif line.startswith("Step:"):
                        instruction = line.replace("Step:", "").strip()
                if not goal or not instruction:
                    return "❌ Please provide both Goal and Step in your message."
            else:
                # Treat the whole message as a general instruction
                goal = "Complete the requested action"
                instruction = message

            # Generate action
            return self.generate_action(image, goal, instruction)
        except Exception as e:
            logger.error(f"Error in chat: {str(e)}")
            return f"❌ Error: {str(e)}"
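# Minimal programmatic-usage sketch (as comments; assumes HF_TOKEN is set and
# that a screenshot exists at the hypothetical path below):
#
#   operator = LOperatorDemo()
#   print(operator.load_model())
#   screenshot = Image.open("screenshots/home.png")  # hypothetical path
#   print(operator.generate_action(
#       screenshot,
#       goal="Open the Settings app and navigate to Display settings",
#       instruction="Tap on the Settings app icon on the home screen",
#   ))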
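# For reference, load_example_episodes() assumes each metadata.json looks
# roughly like the sketch below (inferred from the keys accessed above, not a
# verified schema):
#
#   {
#     "goal": "Open the Settings app and navigate to Display settings",
#     "step_instructions": [
#       "Tap on the Settings app icon on the home screen",
#       "Scroll down and tap Display"
#     ]
#   }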
# Initialize demo
demo_instance = LOperatorDemo()


# Load example episodes
def load_example_episodes():
    """Load example episodes from the extracted data."""
    examples = []
    try:
        episodes = {}
        for episode_id in (13, 53, 73):
            path = f"extracted_episodes_duckdb/episode_{episode_id}/metadata.json"
            with open(path, "r") as f:
                episodes[episode_id] = json.load(f)

        # ChatInterface examples are [message, *additional_inputs], so the
        # text comes first and the screenshot (the additional image input) second.
        examples = [
            [
                f"Goal: {episodes[episode_id]['goal']}\n"
                f"Step: {episodes[episode_id]['step_instructions'][0]}",
                f"extracted_episodes_duckdb/episode_{episode_id}/screenshots/screenshot_1.png",
            ]
            for episode_id in (13, 53, 73)
        ]
    except Exception as e:
        logger.error(f"Error loading examples: {str(e)}")
        examples = []
    return examples


# Create Gradio interface
def create_demo():
    """Create the Gradio demo interface."""
    with gr.Blocks(
        title="L-Operator: Android Device Control Demo",
        theme=gr.themes.Soft(),
        css="""
        .gradio-container { max-width: 1200px !important; }
        .chat-container { height: 600px; }
        """,
    ) as demo:
        gr.Markdown("""
        # 🤖 L-Operator: Android Device Control Demo

        **Lightweight Multimodal Android Device Control Agent**

        This demo showcases the L-Operator model, a fine-tuned multimodal AI agent based on
        LiquidAI's LFM2-VL-1.6B model, optimized for Android device control through visual
        understanding and action generation.

        ## 🚀 How to Use

        1. **Load the Model**: Click the "Load Model" button to initialize the L-Operator model
        2. **Upload Screenshot**: Upload an Android device screenshot
        3. **Provide Instructions**: Enter your goal and step instructions
        4. **Get Actions**: The model will generate JSON actions for Android device control

        ## 📋 Expected Output Format

        The model generates JSON actions in the following format:

        ```json
        {
            "action_type": "tap",
            "x": 540,
            "y": 1200,
            "text": "Settings",
            "app_name": "com.android.settings",
            "confidence": 0.92
        }
        ```

        ---
        """)

        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### 🔧 Model Control")
                load_btn = gr.Button("🚀 Load L-Operator Model", variant="primary", size="lg")
                load_status = gr.Textbox(label="Model Status", value="❌ Model not loaded", interactive=False)

                # ZeroGPU status indicator
                gr.Markdown("### ⚡ ZeroGPU Status")
                if ZEROGPU_AVAILABLE:
                    gr.Markdown("🟢 **ZeroGPU Enabled**: Dynamic GPU allocation for cost-effective inference")
                else:
                    gr.Markdown("🟡 **ZeroGPU Not Available**: Running in standard mode")

                # Token status indicator
                gr.Markdown("### 🔐 Authentication Status")
                if HF_TOKEN:
                    gr.Markdown("🟢 **Token Available**: HF_TOKEN found in environment")
                else:
                    gr.Markdown("🟡 **Token Missing**: HF_TOKEN not found - set in Spaces secrets")

                gr.Markdown("### 📱 Input")
                image_input = gr.Image(
                    label="Android Screenshot",
                    type="pil",
                    height=400,
                    sources=["upload"],  # Gradio 4 replaces the old `tool` argument
                )

                gr.Markdown("### 📝 Instructions")
                goal_input = gr.Textbox(
                    label="Goal",
                    placeholder="e.g., Open the Settings app and navigate to Display settings",
                    lines=2,
                )
                step_input = gr.Textbox(
                    label="Step Instruction",
                    placeholder="e.g., Tap on the Settings app icon on the home screen",
                    lines=2,
                )

                generate_btn = gr.Button("🎯 Generate Action", variant="secondary")

            with gr.Column(scale=2):
                gr.Markdown("### 💬 Chat Interface")
                chat_interface = gr.ChatInterface(
                    fn=demo_instance.chat_with_model,
                    additional_inputs=[image_input],
                    title="L-Operator Chat",
                    description="Chat with L-Operator using screenshots and text instructions",
                    examples=load_example_episodes(),
                    retry_btn="🔄 Retry",
                    undo_btn="↩️ Undo",
                    clear_btn="🗑️ Clear",
                )

                gr.Markdown("### 🎯 Action Output")
                action_output = gr.JSON(label="Generated Action", value={})

        # Event handlers
        def on_load_model():
            return demo_instance.load_model()

        def on_generate_action(image, goal, step):
            if image is None:
                return {"error": "Please upload an image"}
            if not goal or not step:
                return {"error": "Please provide both goal and step"}
            response = demo_instance.generate_action(image, goal, step)
            try:
                # Try to parse as JSON
                return json.loads(response)
            except json.JSONDecodeError:
                return {"raw_response": response}

        load_btn.click(fn=on_load_model, outputs=load_status)
        generate_btn.click(
            fn=on_generate_action,
            inputs=[image_input, goal_input, step_input],
            outputs=action_output,
        )

        gr.Markdown("""
        ---

        ## 📊 Model Details

        | Property | Value |
        |----------|-------|
        | **Base Model** | LiquidAI/LFM2-VL-1.6B |
        | **Architecture** | LFM2-VL (1.6B parameters) |
        | **Fine-tuning** | LoRA (Low-Rank Adaptation) |
        | **Training Data** | Android control episodes with screenshots and actions |

        ## 🎯 Use Cases

        - **Mobile App Testing**: Automated UI testing for Android applications
        - **Accessibility Applications**: Voice-controlled device navigation
        - **Remote Support**: Remote device troubleshooting
        - **Development Workflows**: UI/UX testing automation

        ## ⚡ ZeroGPU Integration

        This demo is optimized for [Hugging Face Spaces ZeroGPU](https://huggingface.co/docs/hub/spaces-zerogpu), providing:

        - **Dynamic GPU Allocation**: NVIDIA H200 GPUs allocated on demand
        - **Cost Efficiency**: Free GPU access with optimized resource utilization
        - **Multi-GPU Support**: Leverage multiple GPUs concurrently
        - **Automatic Management**: GPU resources released after function completion

        ### ZeroGPU Specifications

        - **GPU Type**: NVIDIA H200 slice
        - **Available VRAM**: 70GB per workload
        - **Supported Versions**: Gradio 4+, PyTorch 2.1.2/2.2.2/2.4.0/2.5.1, Python 3.10.13

        ## ⚠️ Important Notes

        - This model requires authentication with Hugging Face
        - Access is restricted to qualified investors under NDA
        - For investment evaluation purposes only
        - Model size: ~1.6B parameters, optimized for real-time use
        - **Token Authentication**: HF_TOKEN must be set in Spaces secrets for model access

        ---

        **Made with ❤️ by Tonic** | [Model on Hugging Face](https://huggingface.co/Tonic/l-android-control) | [ZeroGPU Documentation](https://huggingface.co/docs/hub/spaces-zerogpu)
        """)

    return demo
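# Optional client-side sketch for driving the "Generate Action" endpoint from
# another process. Assumes the `gradio_client` package is installed and the
# demo is running locally; the api_name is a guess at Gradio's auto-generated
# endpoint name (inspect client.view_api() to confirm):
#
#   from gradio_client import Client, handle_file
#   client = Client("http://localhost:7860")
#   result = client.predict(
#       handle_file("screenshot.png"),  # hypothetical local file
#       "Open the Settings app and navigate to Display settings",
#       "Tap on the Settings app icon on the home screen",
#       api_name="/on_generate_action",
#   )
#   print(result)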
# Create and launch the demo
if __name__ == "__main__":
    demo = create_demo()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        debug=True,
        show_error=True,
    )
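# Local run sketch: `python app.py`, then open http://localhost:7860 in a
# browser. On Hugging Face Spaces, set HF_TOKEN under Settings -> Repository
# secrets so load_model() can authenticate; the @spaces.GPU decorators then
# request a ZeroGPU slice only for the decorated calls.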