import cv2
import torch
from PIL import Image
import numpy as np
import os
import shutil
import gradio as gr
import mediapipe as mp
from transformers import (
    LlavaNextVideoProcessor,
    LlavaNextVideoForConditionalGeneration,
    BitsAndBytesConfig,
)

device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "llava-hf/LLaVA-NeXT-Video-7B-hf"

# 4-bit NF4 quantization so the 7B model fits on a single consumer GPU.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

model = LlavaNextVideoForConditionalGeneration.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    low_cpu_mem_usage=True,
    device_map="auto",
)
processor = LlavaNextVideoProcessor.from_pretrained(model_id)

# MediaPipe hand detector; static_image_mode=True treats every frame independently.
mpHands = mp.solutions.hands
hands = mpHands.Hands(static_image_mode=True, max_num_hands=2)
mpDraw = mp.solutions.drawing_utils


def track_hand_position(frame):
    """Detect hands in a BGR frame and report which half (Region A or B) each one is in."""
    height, width = frame.shape[:2]
    mid_width = width // 2
    imgRGB = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # MediaPipe expects RGB input
    results = hands.process(imgRGB)
    hand_positions = []
    if results.multi_hand_landmarks:
        for handLms in results.multi_hand_landmarks:
            # Average the x coordinates of all 21 landmarks to locate the hand's center.
            cx_values = [int(lm.x * width) for lm in handLms.landmark]
            avg_cx = sum(cx_values) / len(cx_values)
            if avg_cx < mid_width:
                hand_positions.append("Region A")
            else:
                hand_positions.append("Region B")
            mpDraw.draw_landmarks(frame, handLms, mpHands.HAND_CONNECTIONS)
    return frame, hand_positions


def add_regions_to_frame(frame, frame_idx, output_dir):
    """Tint the two halves of the frame, label them, run hand tracking, and save the result."""
    height, width = frame.shape[:2]
    mid_width = width // 2
    overlay = frame.copy()
    cv2.rectangle(overlay, (0, 0), (mid_width, height), (255, 0, 0), -1)      # left half: blue (BGR)
    cv2.rectangle(overlay, (mid_width, 0), (width, height), (0, 255, 0), -1)  # right half: green
    frame = cv2.addWeighted(frame, 0.7, overlay, 0.3, 0)
    cv2.line(frame, (mid_width, 0), (mid_width, height), (255, 255, 255), 3)
    cv2.putText(frame, "Region A", (mid_width // 4, height // 2),
                cv2.FONT_HERSHEY_SIMPLEX, 2, (255, 255, 255), 3)
    cv2.putText(frame, "Region B", (mid_width + mid_width // 4, height // 2),
                cv2.FONT_HERSHEY_SIMPLEX, 2, (255, 255, 255), 3)
    # Hand detection runs on the annotated copy, so landmarks are drawn over the region overlay.
    tracked_frame, hand_pos = track_hand_position(frame.copy())
    cv2.imwrite(f"{output_dir}/frame_{frame_idx:03d}.jpg", tracked_frame)
    return tracked_frame, hand_pos


def sample_frames(video_path, num_frames):
    """Uniformly sample up to num_frames frames, annotate each, and save them to disk."""
    output_dir = "/tmp/processed_frames"
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)

    video = cv2.VideoCapture(video_path)
    if not video.isOpened():
        raise ValueError(f"Could not open video file: {video_path}")

    total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
    interval = max(1, total_frames // num_frames)
    frames = []
    frame_count = 0
    hand_tracking_log = []

    for i in range(total_frames):
        ret, frame = video.read()
        if not ret:
            break  # end of stream; subsequent reads would also fail
        if i % interval == 0 and len(frames) < num_frames:
            processed_frame, hand_positions = add_regions_to_frame(frame, frame_count, output_dir)
            pil_img = Image.fromarray(cv2.cvtColor(processed_frame, cv2.COLOR_BGR2RGB))
            frames.append(pil_img)
            hand_tracking_log.append(f"Frame {frame_count}: {hand_positions}")
            frame_count += 1

    video.release()
    frame_paths = [f"{output_dir}/frame_{i:03d}.jpg" for i in range(frame_count)]
    return frames, frame_paths, hand_tracking_log


def analyze_video(video_path):
    """Classify a QC video as PASSED, FAILED, or CHEATING using LLaVA-NeXT-Video."""
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": """Analyze this gas pipe quality control video and classify it into one category:
1) PASSED - pipe taken from Region A, dipped in water, no bubbles, moved to Region B.
Example: Person picks a pipe from the left side, tests it in water, no bubbles are seen, places it on the right side.
2) FAILED - pipe tested in water, bubbles visible.
Example: Person dips a pipe in water, bubbles appear indicating a leak, pipe is rejected.
3) CHEATING - pipe moved from A to B without testing.
Example: Person takes a pipe from the left and places it directly on the right without a water test.
Give the classification and a brief reason."""},
                {"type": "video"},
            ],
        },
    ]
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

    video_frames, frame_paths, hand_log = sample_frames(video_path, 8)

    # return_tensors="pt" is required so the inputs are tensors that can be moved to the GPU.
    inputs = processor(text=prompt, videos=video_frames, padding=True, return_tensors="pt")
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    output = model.generate(
        **inputs,
        max_new_tokens=150,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        top_k=50,
        repetition_penalty=1.1,
        pad_token_id=processor.tokenizer.eos_token_id,
    )
    result = processor.decode(output[0][2:], skip_special_tokens=True)

    hand_tracking_summary = "\n".join(hand_log)
    return frame_paths, result, hand_tracking_summary


examples = [
    ["07.mp4"],
    ["07_part1.mp4"],
    ["07_part2.mp4"],
]

iface = gr.Interface(
    fn=analyze_video,
    inputs=gr.Video(),
    outputs=[
        gr.Gallery(label="Processed Frames"),
        gr.Textbox(label="LLM Analysis", lines=10),
        gr.Textbox(label="Hand Tracking Log", lines=15),
    ],
    title="Gas Pipe Quality Control Analyzer",
    examples=examples,
    cache_examples=False,
)

iface.launch(share=True)
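
# A minimal headless sanity check (a sketch, not part of the app): it assumes one
# of the example clips listed above sits next to this script. Since launch()
# blocks the main thread, run this instead of the Gradio UI by commenting out
# iface.launch(...) above and uncommenting the lines below:
#
#   frame_paths, analysis, hand_log = analyze_video("07.mp4")
#   print(analysis)
#   print(hand_log)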