import json
import logging
import os
from typing import List

import gradio as gr
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForImageTextToText

# ZeroGPU support: the `spaces` package only exists on Hugging Face Spaces,
# so fall back to a no-op decorator when running locally.
try:
    import spaces
    ZEROGPU_AVAILABLE = True
except ImportError:
    ZEROGPU_AVAILABLE = False

    class _NoOpSpaces:
        """Fallback so @spaces.GPU is a harmless no-op outside Spaces."""

        @staticmethod
        def GPU(*args, **kwargs):
            def decorator(fn):
                return fn
            return decorator

    spaces = _NoOpSpaces()

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Model configuration
MODEL_ID = "Tonic/l-android-control"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Get Hugging Face token from environment variable (Spaces secrets)
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    logger.warning("HF_TOKEN not found in environment variables. Model access may be restricted.")


class LOperatorDemo:
    def __init__(self):
        self.model = None
        self.processor = None
        self.is_loaded = False

    def load_model(self):
        """Load the L-Operator model and processor."""
        try:
            logger.info(f"Loading model {MODEL_ID} on device {DEVICE}")

            # Check if token is available
            if not HF_TOKEN:
                return "❌ HF_TOKEN not found. Please set HF_TOKEN in Spaces secrets."

            # Load processor with token
            self.processor = AutoProcessor.from_pretrained(
                MODEL_ID,
                trust_remote_code=True,
                token=HF_TOKEN,
            )

            # Load model with token
            self.model = AutoModelForImageTextToText.from_pretrained(
                MODEL_ID,
                torch_dtype=torch.bfloat16 if DEVICE == "cuda" else torch.float32,
                trust_remote_code=True,
                device_map="auto" if DEVICE == "cuda" else None,
                token=HF_TOKEN,
            )
            if DEVICE == "cpu":
                self.model = self.model.to(DEVICE)

            self.is_loaded = True
            logger.info("Model loaded successfully with token authentication")
            return "✅ Model loaded successfully with token authentication!"
        except Exception as e:
            logger.error(f"Error loading model: {str(e)}")
            return f"❌ Error loading model: {str(e)}"

    @spaces.GPU(duration=120)  # 2 minutes for action generation
    def generate_action(self, image: Image.Image, goal: str, instruction: str) -> str:
        """Generate an action from a screenshot plus goal/step text."""
        if not self.is_loaded:
            return "❌ Model not loaded. Please load the model first."

        try:
            # Convert image to RGB if needed
            if image.mode != "RGB":
                image = image.convert("RGB")

            # Build conversation
            conversation = [
                {
                    "role": "system",
                    "content": [
                        {"type": "text", "text": "You are a helpful multimodal assistant by Liquid AI."}
                    ],
                },
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "image": image},
                        {
                            "type": "text",
                            "text": (
                                f"Goal: {goal}\nStep: {instruction}\n"
                                "Respond with a JSON action containing relevant keys "
                                "(e.g., action_type, x, y, text, app_name, direction)."
                            ),
                        },
                    ],
                },
            ]

            # Process inputs; return_dict=True so pixel values travel with input_ids
            inputs = self.processor.apply_chat_template(
                conversation,
                add_generation_prompt=True,
                tokenize=True,
                return_dict=True,
                return_tensors="pt",
            ).to(self.model.device)

            # Generate response
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=128,
                    do_sample=True,
                    temperature=0.7,
                    top_p=0.9,
                )

            # Decode only the newly generated tokens
            response = self.processor.tokenizer.decode(
                outputs[0][inputs["input_ids"].shape[1]:],
                skip_special_tokens=True,
            )

            # Try to parse as JSON for better formatting
            try:
                parsed_response = json.loads(response)
                return json.dumps(parsed_response, indent=2)
            except json.JSONDecodeError:
                return response
        except Exception as e:
            logger.error(f"Error generating action: {str(e)}")
            return f"❌ Error generating action: {str(e)}"

    @spaces.GPU(duration=90)  # 1.5 minutes for chat responses
    def chat_with_model(self, message: str, history: List[List[str]], image: Image.Image = None) -> str:
        """Chat function for gr.ChatInterface: returns the bot reply as a string."""
        if not self.is_loaded:
            return "❌ Model not loaded. Please load the model first."
        if image is None:
            return "❌ Please upload an Android screenshot image."

        try:
            # Extract goal and instruction from a structured message if present
            if "Goal:" in message and "Step:" in message:
                goal = ""
                instruction = ""
                for line in message.split("\n"):
                    if line.startswith("Goal:"):
                        goal = line.replace("Goal:", "").strip()
                    elif line.startswith("Step:"):
                        instruction = line.replace("Step:", "").strip()
                if not goal or not instruction:
                    return "❌ Please provide both Goal and Step in your message."
            else:
                # Treat the whole message as a general instruction
                goal = "Complete the requested action"
                instruction = message

            # Generate action
            return self.generate_action(image, goal, instruction)
        except Exception as e:
            logger.error(f"Error in chat: {str(e)}")
            return f"❌ Error: {str(e)}"
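# Minimal programmatic-usage sketch (as comments; assumes HF_TOKEN is set and
# that a screenshot exists at the hypothetical path below):
#
#   operator = LOperatorDemo()
#   print(operator.load_model())
#   screenshot = Image.open("screenshots/home.png")  # hypothetical path
#   print(operator.generate_action(
#       screenshot,
#       goal="Open the Settings app and navigate to Display settings",
#       instruction="Tap on the Settings app icon on the home screen",
#   ))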
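# For reference, load_example_episodes() assumes each metadata.json looks
# roughly like the sketch below (inferred from the keys accessed above, not a
# verified schema):
#
#   {
#     "goal": "Open the Settings app and navigate to Display settings",
#     "step_instructions": [
#       "Tap on the Settings app icon on the home screen",
#       "Scroll down and tap Display"
#     ]
#   }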
# Initialize demo
demo_instance = LOperatorDemo()


# Load example episodes
def load_example_episodes():
    """Load example episodes from the extracted data."""
    examples = []
    try:
        episodes = {}
        for episode_id in (13, 53, 73):
            path = f"extracted_episodes_duckdb/episode_{episode_id}/metadata.json"
            with open(path, "r") as f:
                episodes[episode_id] = json.load(f)

        # ChatInterface examples are [message, *additional_inputs], so the
        # text comes first and the screenshot (the additional image input) second.
        examples = [
            [
                f"Goal: {episodes[episode_id]['goal']}\n"
                f"Step: {episodes[episode_id]['step_instructions'][0]}",
                f"extracted_episodes_duckdb/episode_{episode_id}/screenshots/screenshot_1.png",
            ]
            for episode_id in (13, 53, 73)
        ]
    except Exception as e:
        logger.error(f"Error loading examples: {str(e)}")
        examples = []
    return examples


# Create Gradio interface
def create_demo():
    """Create the Gradio demo interface."""
    with gr.Blocks(
        title="L-Operator: Android Device Control Demo",
        theme=gr.themes.Soft(),
        css="""
        .gradio-container { max-width: 1200px !important; }
        .chat-container { height: 600px; }
        """,
    ) as demo:
        gr.Markdown("""
        # 🤖 L-Operator: Android Device Control Demo

        **Lightweight Multimodal Android Device Control Agent**

        This demo showcases the L-Operator model, a fine-tuned multimodal AI agent based on
        LiquidAI's LFM2-VL-1.6B model, optimized for Android device control through visual
        understanding and action generation.

        ## 🚀 How to Use

        1. **Load the Model**: Click the "Load Model" button to initialize the L-Operator model
        2. **Upload Screenshot**: Upload an Android device screenshot
        3. **Provide Instructions**: Enter your goal and step instructions
        4. **Get Actions**: The model will generate JSON actions for Android device control

        ## 📋 Expected Output Format

        The model generates JSON actions in the following format:

        ```json
        {
            "action_type": "tap",
            "x": 540,
            "y": 1200,
            "text": "Settings",
            "app_name": "com.android.settings",
            "confidence": 0.92
        }
        ```

        ---
        """)

        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### 🔧 Model Control")
                load_btn = gr.Button("🚀 Load L-Operator Model", variant="primary", size="lg")
                load_status = gr.Textbox(label="Model Status", value="❌ Model not loaded", interactive=False)

                # ZeroGPU status indicator
                gr.Markdown("### ⚡ ZeroGPU Status")
                if ZEROGPU_AVAILABLE:
                    gr.Markdown("🟢 **ZeroGPU Enabled**: Dynamic GPU allocation for cost-effective inference")
                else:
                    gr.Markdown("🟡 **ZeroGPU Not Available**: Running in standard mode")

                # Token status indicator
                gr.Markdown("### 🔐 Authentication Status")
                if HF_TOKEN:
                    gr.Markdown("🟢 **Token Available**: HF_TOKEN found in environment")
                else:
                    gr.Markdown("🟡 **Token Missing**: HF_TOKEN not found - set in Spaces secrets")

                gr.Markdown("### 📱 Input")
                image_input = gr.Image(
                    label="Android Screenshot",
                    type="pil",
                    height=400,
                    sources=["upload"],  # Gradio 4 replaces the old `tool` argument
                )

                gr.Markdown("### 📝 Instructions")
                goal_input = gr.Textbox(
                    label="Goal",
                    placeholder="e.g., Open the Settings app and navigate to Display settings",
                    lines=2,
                )
                step_input = gr.Textbox(
                    label="Step Instruction",
                    placeholder="e.g., Tap on the Settings app icon on the home screen",
                    lines=2,
                )

                generate_btn = gr.Button("🎯 Generate Action", variant="secondary")

            with gr.Column(scale=2):
                gr.Markdown("### 💬 Chat Interface")
                chat_interface = gr.ChatInterface(
                    fn=demo_instance.chat_with_model,
                    additional_inputs=[image_input],
                    title="L-Operator Chat",
                    description="Chat with L-Operator using screenshots and text instructions",
                    examples=load_example_episodes(),
                    retry_btn="🔄 Retry",
                    undo_btn="↩️ Undo",
                    clear_btn="🗑️ Clear",
                )

                gr.Markdown("### 🎯 Action Output")
                action_output = gr.JSON(label="Generated Action", value={})

        # Event handlers
        def on_load_model():
            return demo_instance.load_model()

        def on_generate_action(image, goal, step):
            if image is None:
                return {"error": "Please upload an image"}
            if not goal or not step:
                return {"error": "Please provide both goal and step"}
            response = demo_instance.generate_action(image, goal, step)
            try:
                # Try to parse as JSON
                return json.loads(response)
            except json.JSONDecodeError:
                return {"raw_response": response}

        load_btn.click(fn=on_load_model, outputs=load_status)
        generate_btn.click(
            fn=on_generate_action,
            inputs=[image_input, goal_input, step_input],
            outputs=action_output,
        )

        gr.Markdown("""
        ---

        ## 📊 Model Details

        | Property | Value |
        |----------|-------|
        | **Base Model** | LiquidAI/LFM2-VL-1.6B |
        | **Architecture** | LFM2-VL (1.6B parameters) |
        | **Fine-tuning** | LoRA (Low-Rank Adaptation) |
        | **Training Data** | Android control episodes with screenshots and actions |

        ## 🎯 Use Cases

        - **Mobile App Testing**: Automated UI testing for Android applications
        - **Accessibility Applications**: Voice-controlled device navigation
        - **Remote Support**: Remote device troubleshooting
        - **Development Workflows**: UI/UX testing automation

        ## ⚡ ZeroGPU Integration

        This demo is optimized for [Hugging Face Spaces ZeroGPU](https://huggingface.co/docs/hub/spaces-zerogpu), providing:

        - **Dynamic GPU Allocation**: NVIDIA H200 GPUs allocated on demand
        - **Cost Efficiency**: Free GPU access with optimized resource utilization
        - **Multi-GPU Support**: Leverage multiple GPUs concurrently
        - **Automatic Management**: GPU resources released after function completion

        ### ZeroGPU Specifications

        - **GPU Type**: NVIDIA H200 slice
        - **Available VRAM**: 70GB per workload
        - **Supported Versions**: Gradio 4+, PyTorch 2.1.2/2.2.2/2.4.0/2.5.1, Python 3.10.13

        ## ⚠️ Important Notes

        - This model requires authentication with Hugging Face
        - Access is restricted to qualified investors under NDA
        - For investment evaluation purposes only
        - Model size: ~1.6B parameters, optimized for real-time use
        - **Token Authentication**: HF_TOKEN must be set in Spaces secrets for model access

        ---

        **Made with ❤️ by Tonic** | [Model on Hugging Face](https://huggingface.co/Tonic/l-android-control) | [ZeroGPU Documentation](https://huggingface.co/docs/hub/spaces-zerogpu)
        """)

    return demo
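# Optional client-side sketch for driving the "Generate Action" endpoint from
# another process. Assumes the `gradio_client` package is installed and the
# demo is running locally; the api_name is a guess at Gradio's auto-generated
# endpoint name (inspect client.view_api() to confirm):
#
#   from gradio_client import Client, handle_file
#   client = Client("http://localhost:7860")
#   result = client.predict(
#       handle_file("screenshot.png"),  # hypothetical local file
#       "Open the Settings app and navigate to Display settings",
#       "Tap on the Settings app icon on the home screen",
#       api_name="/on_generate_action",
#   )
#   print(result)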
# Create and launch the demo
if __name__ == "__main__":
    demo = create_demo()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        debug=True,
        show_error=True,
    )
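# Local run sketch: `python app.py`, then open http://localhost:7860 in a
# browser. On Hugging Face Spaces, set HF_TOKEN under Settings -> Repository
# secrets so load_model() can authenticate; the @spaces.GPU decorators then
# request a ZeroGPU slice only for the decorated calls.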