import gradio as gr
import torch
from PIL import Image
from transformers import (
    BlipProcessor,
    BlipForConditionalGeneration,
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
)
from typing import Optional, Union
from gtts import gTTS
import uuid
import time
import gc

# Limit CPU thread usage so the app stays responsive on small machines.
torch.set_num_threads(2)

# The pipeline is created lazily on first use, so the UI starts immediately
# and the models are only loaded when a request actually needs them.
_pipeline = None


def init_pipeline():
    global _pipeline
    if _pipeline is None:
        _pipeline = ImageCaptionPipeline()
    return _pipeline


class ImageCaptionPipeline:
    """BLIP image captioning (English) plus Helsinki-NLP translation to Russian."""

    def __init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        start_time = time.time()
        self.blip_processor = BlipProcessor.from_pretrained(
            "Salesforce/blip-image-captioning-large", use_fast=True
        )
        self.blip_model = BlipForConditionalGeneration.from_pretrained(
            "Salesforce/blip-image-captioning-large"
        ).to(self.device)
        print(f"BLIP load time: {time.time() - start_time:.2f} s")

        start_time = time.time()
        self.translator_tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-ru")
        self.translator_model = AutoModelForSeq2SeqLM.from_pretrained(
            "Helsinki-NLP/opus-mt-en-ru"
        ).to(self.device)
        print(f"Translator load time: {time.time() - start_time:.2f} s")

    def generate_captions(self, image: Union[str, Image.Image]) -> tuple:
        """Return an (english_caption, russian_caption) pair for an image or image path."""
        start_time = time.time()
        if isinstance(image, str):
            image = Image.open(image)
        image = image.convert("RGB")
        # Downscale to a fixed size to bound preprocessing and inference cost.
        image = image.resize((512, 512), Image.Resampling.LANCZOS)

        inputs = self.blip_processor(images=image, return_tensors="pt").to(self.device)
        with torch.no_grad():
            output_ids = self.blip_model.generate(
                **inputs, max_length=50, num_beams=2, early_stopping=True
            )
        english_caption = self.blip_processor.decode(output_ids[0], skip_special_tokens=True)
        print(f"English caption generation time: {time.time() - start_time:.2f} s")

        start_time = time.time()
        translated_inputs = self.translator_tokenizer(
            english_caption, return_tensors="pt", padding=True
        ).to(self.device)
        with torch.no_grad():
            translated_ids = self.translator_model.generate(
                **translated_inputs, max_length=50, num_beams=2, early_stopping=True
            )
        russian_caption = self.translator_tokenizer.decode(translated_ids[0], skip_special_tokens=True)
        print(f"Russian translation time: {time.time() - start_time:.2f} s")

        gc.collect()
        return english_caption, russian_caption

    def generate_audio(self, text: str, language: str) -> str:
        """Synthesize speech for the caption with gTTS and return the .mp3 path."""
        start_time = time.time()
        lang_code = "ru" if language == "Russian" else "en"
        tts = gTTS(text=text, lang=lang_code)
        # A unique filename avoids collisions between concurrent requests.
        audio_path = f"caption_audio_{uuid.uuid4()}.mp3"
        tts.save(audio_path)
        print(f"TTS generation time: {time.time() - start_time:.2f} s")
        return audio_path


def generate_captions(image: Image.Image) -> tuple:
    if image is not None:
        pipeline = init_pipeline()
        english_caption, russian_caption = pipeline.generate_captions(image)
        # The trailing None clears any previously generated audio clip.
        return english_caption, russian_caption, None
    return "Please upload an image.", "Please upload an image.", None


def generate_audio(english_caption: str, russian_caption: str, audio_language: str) -> Optional[str]:
    if not english_caption and not russian_caption:
        return None
    pipeline = init_pipeline()
    text = russian_caption if audio_language == "Russian" else english_caption
    return pipeline.generate_audio(text, audio_language)


with gr.Blocks(css="""
    .btn { width: 200px; background-color: #4B0082; color: white; font-size: 16px; }
    .equal-height { height: 100px !important; }
""") as iface:
    with gr.Row():
        with gr.Column(scale=1, min_width=400, variant="panel"):
            with gr.Row():
                image = gr.Image(type="pil", label="Image", height=400, width=400)
            with gr.Row():
                submit_button = gr.Button("Generate caption", elem_classes="btn")
        with gr.Column(scale=1, variant="panel"):
            with gr.Row():
                english_caption = gr.Textbox(label="English:", lines=1, interactive=False)
                russian_caption = gr.Textbox(label="Russian:", lines=1, interactive=False)
            with gr.Row():
                audio_language = gr.Dropdown(
                    choices=["Russian", "English"],
                    label="Audio language",
                    value="Russian",
                    elem_classes="equal-height",
                )
                audio_output = gr.Audio(label="Audio", elem_classes="equal-height")
            with gr.Row():
                audio_button = gr.Button("Generate audio", elem_classes="btn")

    # audio_output is included in the outputs so that the three values returned
    # by generate_captions match the component list (the None resets the player).
    submit_button.click(
        fn=generate_captions,
        inputs=[image],
        outputs=[english_caption, russian_caption, audio_output],
    )
    audio_button.click(
        fn=generate_audio,
        inputs=[english_caption, russian_caption, audio_language],
        outputs=[audio_output],
    )

if __name__ == "__main__":
    iface.launch()  # Pum-pummm..