import gradio as gr
import torch
from PIL import Image
import json
import os
from transformers import AutoProcessor, AutoModelForImageTextToText
from typing import List, Dict, Any
import logging

# ZeroGPU support: `spaces` is only available on Hugging Face Spaces hardware,
# so fall back to a no-op decorator when running elsewhere.
try:
    import spaces
    ZEROGPU_AVAILABLE = True
except ImportError:
    ZEROGPU_AVAILABLE = False

    class spaces:  # minimal stand-in so the @spaces.GPU decorators still work locally
        @staticmethod
        def GPU(*_args, **_kwargs):
            return lambda fn: fn
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Model configuration
MODEL_ID = "Tonic/l-android-control"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Get Hugging Face token from environment variable (Spaces secrets)
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    logger.warning("HF_TOKEN not found in environment variables. Model access may be restricted.")
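# (On Hugging Face Spaces, secrets such as HF_TOKEN are added under
# Settings → Variables and secrets; they surface here as environment variables.)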

class LOperatorDemo:
    def __init__(self):
        self.model = None
        self.processor = None
        self.is_loaded = False

    def load_model(self):
        """Load the L-Operator model and processor"""
        try:
            logger.info(f"Loading model {MODEL_ID} on device {DEVICE}")

            # Check if token is available
            if not HF_TOKEN:
                return "❌ HF_TOKEN not found. Please set HF_TOKEN in Spaces secrets."

            # Load processor with token
            self.processor = AutoProcessor.from_pretrained(
                MODEL_ID,
                trust_remote_code=True,
                token=HF_TOKEN
            )

            # Load model with token
            self.model = AutoModelForImageTextToText.from_pretrained(
                MODEL_ID,
                torch_dtype=torch.bfloat16 if DEVICE == "cuda" else torch.float32,
                trust_remote_code=True,
                device_map="auto" if DEVICE == "cuda" else None,
                token=HF_TOKEN
            )
            if DEVICE == "cpu":
                self.model = self.model.to(DEVICE)

            self.is_loaded = True
            logger.info("Model loaded successfully with token authentication")
            return "✅ Model loaded successfully with token authentication!"
        except Exception as e:
            logger.error(f"Error loading model: {str(e)}")
            return f"❌ Error loading model: {str(e)}"
    # ZeroGPU: allocate a GPU for up to 2 minutes for action generation
    @spaces.GPU(duration=120)
    def generate_action(self, image: Image.Image, goal: str, instruction: str) -> str:
        """Generate action based on image and text inputs"""
        if not self.is_loaded:
            return "❌ Model not loaded. Please load the model first."
        try:
            # Convert image to RGB if needed
            if image.mode != "RGB":
                image = image.convert("RGB")

            # Build conversation
            conversation = [
                {
                    "role": "system",
                    "content": [
                        {"type": "text", "text": "You are a helpful multimodal assistant by Liquid AI."}
                    ]
                },
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "image": image},
                        {"type": "text", "text": f"Goal: {goal}\nStep: {instruction}\nRespond with a JSON action containing relevant keys (e.g., action_type, x, y, text, app_name, direction)."}
                    ]
                }
            ]

            # Process inputs (tokenize and return a dict so image tensors are included)
            inputs = self.processor.apply_chat_template(
                conversation,
                add_generation_prompt=True,
                tokenize=True,
                return_dict=True,
                return_tensors="pt"
            ).to(self.model.device)

            # Generate response
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=128,
                    do_sample=True,
                    temperature=0.7,
                    top_p=0.9
                )
            # Decode only the newly generated tokens
            response = self.processor.tokenizer.decode(
                outputs[0][inputs["input_ids"].shape[1]:],
                skip_special_tokens=True
            )

            # Try to parse as JSON for better formatting
            try:
                parsed_response = json.loads(response)
                return json.dumps(parsed_response, indent=2)
            except json.JSONDecodeError:
                return response
        except Exception as e:
            logger.error(f"Error generating action: {str(e)}")
            return f"❌ Error generating action: {str(e)}"
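
    # Illustrative call (the screenshot path and coordinates are hypothetical):
    #   demo_instance.generate_action(screenshot, "Open the Settings app",
    #                                 "Tap the Settings icon")
    #   -> '{\n  "action_type": "tap",\n  "x": 540,\n  "y": 1200\n}'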
    # ZeroGPU: allocate a GPU for up to 1.5 minutes for chat responses
    @spaces.GPU(duration=90)
    def chat_with_model(self, message: str, history: List[List[str]], image: Image.Image = None) -> str:
        """Chat function for gr.ChatInterface; returns the assistant's reply"""
        if not self.is_loaded:
            return "❌ Model not loaded. Please load the model first."
        if image is None:
            return "❌ Please upload an Android screenshot image."
        try:
            # Extract goal and instruction from message
            if "Goal:" in message and "Step:" in message:
                # Parse structured input
                lines = message.split('\n')
                goal = ""
                instruction = ""
                for line in lines:
                    if line.startswith("Goal:"):
                        goal = line.replace("Goal:", "").strip()
                    elif line.startswith("Step:"):
                        instruction = line.replace("Step:", "").strip()
                if not goal or not instruction:
                    return "❌ Please provide both Goal and Step in your message."
            else:
                # Treat as general instruction
                goal = "Complete the requested action"
                instruction = message

            # Generate action
            return self.generate_action(image, goal, instruction)
        except Exception as e:
            logger.error(f"Error in chat: {str(e)}")
            return f"❌ Error: {str(e)}"

# Initialize demo
demo_instance = LOperatorDemo()


# Load example episodes
def load_example_episodes():
    """Load example episodes from the extracted data"""
    examples = []
    try:
        # Load episode 13
        with open("extracted_episodes_duckdb/episode_13/metadata.json", "r") as f:
            episode_13 = json.load(f)
        # Load episode 53
        with open("extracted_episodes_duckdb/episode_53/metadata.json", "r") as f:
            episode_53 = json.load(f)
        # Load episode 73
        with open("extracted_episodes_duckdb/episode_73/metadata.json", "r") as f:
            episode_73 = json.load(f)

        # Create examples: for gr.ChatInterface, the message comes first,
        # followed by values for additional_inputs (here, the screenshot path)
        examples = [
            [
                f"Goal: {episode_13['goal']}\nStep: {episode_13['step_instructions'][0]}",
                "extracted_episodes_duckdb/episode_13/screenshots/screenshot_1.png"
            ],
            [
                f"Goal: {episode_53['goal']}\nStep: {episode_53['step_instructions'][0]}",
                "extracted_episodes_duckdb/episode_53/screenshots/screenshot_1.png"
            ],
            [
                f"Goal: {episode_73['goal']}\nStep: {episode_73['step_instructions'][0]}",
                "extracted_episodes_duckdb/episode_73/screenshots/screenshot_1.png"
            ]
        ]
    except Exception as e:
        logger.error(f"Error loading examples: {str(e)}")
        examples = []
    return examples

# Create Gradio interface
def create_demo():
    """Create the Gradio demo interface"""
    with gr.Blocks(
        title="L-Operator: Android Device Control Demo",
        theme=gr.themes.Soft(),
        css="""
        .gradio-container {
            max-width: 1200px !important;
        }
        .chat-container {
            height: 600px;
        }
        """
    ) as demo:
| gr.Markdown(""" | |
| # π€ L-Operator: Android Device Control Demo | |
| **Lightweight Multimodal Android Device Control Agent** | |
| This demo showcases the L-Operator model, a fine-tuned multimodal AI agent based on LiquidAI's LFM2-VL-1.6B model, | |
| optimized for Android device control through visual understanding and action generation. | |
| ## π How to Use | |
| 1. **Load the Model**: Click the "Load Model" button to initialize the L-Operator model | |
| 2. **Upload Screenshot**: Upload an Android device screenshot | |
| 3. **Provide Instructions**: Enter your goal and step instructions | |
| 4. **Get Actions**: The model will generate JSON actions for Android device control | |
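
        For example, a structured chat message combining a goal with a single step (illustrative text borrowed from the input-field placeholders below) looks like:

        ```
        Goal: Open the Settings app and navigate to Display settings
        Step: Tap on the Settings app icon on the home screen
        ```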

        ## 📋 Expected Output Format

        The model generates JSON actions in the following format:

        ```json
        {
            "action_type": "tap",
            "x": 540,
            "y": 1200,
            "text": "Settings",
            "app_name": "com.android.settings",
            "confidence": 0.92
        }
        ```
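
        Depending on `action_type`, only the relevant subset of keys is expected (e.g., a scroll action would typically carry `direction` rather than tap coordinates).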

        ---
        """)
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### 🔧 Model Control")
                load_btn = gr.Button("🚀 Load L-Operator Model", variant="primary", size="lg")
                load_status = gr.Textbox(label="Model Status", value="❌ Model not loaded", interactive=False)

                # ZeroGPU status indicator
                gr.Markdown("### ⚡ ZeroGPU Status")
                if ZEROGPU_AVAILABLE:
                    gr.Markdown("🟢 **ZeroGPU Enabled**: Dynamic GPU allocation for cost-effective inference")
                else:
                    gr.Markdown("🟡 **ZeroGPU Not Available**: Running in standard mode")

                # Token status indicator
                gr.Markdown("### 🔐 Authentication Status")
                if HF_TOKEN:
                    gr.Markdown("🟢 **Token Available**: HF_TOKEN found in environment")
                else:
                    gr.Markdown("🟡 **Token Missing**: HF_TOKEN not found - set in Spaces secrets")
| gr.Markdown("### π± Input") | |
| image_input = gr.Image( | |
| label="Android Screenshot", | |
| type="pil", | |
| height=400, | |
| tool="upload" | |
| ) | |
| gr.Markdown("### π Instructions") | |
| goal_input = gr.Textbox( | |
| label="Goal", | |
| placeholder="e.g., Open the Settings app and navigate to Display settings", | |
| lines=2 | |
| ) | |
| step_input = gr.Textbox( | |
| label="Step Instruction", | |
| placeholder="e.g., Tap on the Settings app icon on the home screen", | |
| lines=2 | |
| ) | |
| generate_btn = gr.Button("π― Generate Action", variant="secondary") | |
            with gr.Column(scale=2):
                gr.Markdown("### 💬 Chat Interface")
                chat_interface = gr.ChatInterface(
                    fn=demo_instance.chat_with_model,
                    # ChatInterface has no height parameter; size the inner Chatbot instead
                    chatbot=gr.Chatbot(height=600),
                    additional_inputs=[image_input],
                    title="L-Operator Chat",
                    description="Chat with L-Operator using screenshots and text instructions",
                    examples=load_example_episodes(),
                    retry_btn="🔄 Retry",
                    undo_btn="↩️ Undo",
                    clear_btn="🗑️ Clear"
                )

                gr.Markdown("### 🎯 Action Output")
                action_output = gr.JSON(
                    label="Generated Action",
                    value={}
                )
        # Event handlers
        def on_load_model():
            return demo_instance.load_model()

        def on_generate_action(image, goal, step):
            if image is None:
                return {"error": "Please upload an image"}
            if not goal or not step:
                return {"error": "Please provide both goal and step"}
            response = demo_instance.generate_action(image, goal, step)
            try:
                # Try to parse as JSON
                return json.loads(response)
            except json.JSONDecodeError:
                return {"raw_response": response}

        load_btn.click(
            fn=on_load_model,
            outputs=load_status
        )
        generate_btn.click(
            fn=on_generate_action,
            inputs=[image_input, goal_input, step_input],
            outputs=action_output
        )
        # The uploaded screenshot is already forwarded to the chat function via
        # additional_inputs above, so no extra wiring is needed when it changes.
| gr.Markdown(""" | |
| --- | |
| ## π Model Details | |
| | Property | Value | | |
| |----------|-------| | |
| | **Base Model** | LiquidAI/LFM2-VL-1.6B | | |
| | **Architecture** | LFM2-VL (1.6B parameters) | | |
| | **Fine-tuning** | LoRA (Low-Rank Adaptation) | | |
| | **Training Data** | Android control episodes with screenshots and actions | | |
| ## π― Use Cases | |
| - **Mobile App Testing**: Automated UI testing for Android applications | |
| - **Accessibility Applications**: Voice-controlled device navigation | |
| - **Remote Support**: Remote device troubleshooting | |
| - **Development Workflows**: UI/UX testing automation | |
| ## β‘ ZeroGPU Integration | |
| This demo is optimized for [Hugging Face Spaces ZeroGPU](https://huggingface.co/docs/hub/spaces-zerogpu), providing: | |
| - **Dynamic GPU Allocation**: NVIDIA H200 GPUs allocated on-demand | |
| - **Cost Efficiency**: Free GPU access with optimized resource utilization | |
| - **Multi-GPU Support**: Leverage multiple GPUs concurrently | |
| - **Automatic Management**: GPU resources released after function completion | |
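
        In this app, the GPU-bound methods are wrapped with `spaces.GPU`, the decorator pattern ZeroGPU expects. A minimal sketch (the duration mirrors the one used for action generation in this demo):

        ```python
        import spaces

        @spaces.GPU(duration=120)  # hold a GPU slice for up to 2 minutes
        def run_inference(prompt: str) -> str:
            # Heavy work (e.g., model.generate) runs while the GPU is allocated;
            # the GPU is released automatically when the function returns.
            return prompt
        ```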

        ### ZeroGPU Specifications

        - **GPU Type**: NVIDIA H200 slice
        - **Available VRAM**: 70GB per workload
        - **Supported Versions**: Gradio 4+, PyTorch 2.1.2/2.2.2/2.4.0/2.5.1, Python 3.10.13

        ## ⚠️ Important Notes

        - This model requires authentication with Hugging Face
        - Access is restricted to qualified investors under NDA
        - For investment evaluation purposes only
        - Model size: ~1.6B parameters, optimized for real-time use
        - **Token Authentication**: HF_TOKEN must be set in Spaces secrets for model access

        ---

        **Made with ❤️ by Tonic** | [Model on Hugging Face](https://huggingface.co/Tonic/l-android-control) | [ZeroGPU Documentation](https://huggingface.co/docs/hub/spaces-zerogpu)
        """)

    return demo


# Create and launch the demo
if __name__ == "__main__":
    demo = create_demo()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        debug=True,
        show_error=True,
        ssr_mode=False
    )
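
# Local run (assumed workflow): set HF_TOKEN in the environment and execute
#   HF_TOKEN=hf_xxx python app.py
# then open http://localhost:7860 in a browser.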