import gradio as gr
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import spaces
import numpy as np
from pydub import AudioSegment
import tempfile
import os
import subprocess
import re

# Model configuration - Using CrisperWhisper for TRUE verbatim transcription
# CrisperWhisper is designed to transcribe EVERY word including um, uh, fillers, stutters, false starts
MODEL_NAME = "nyrahealth/CrisperWhisper"
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

print(f"Loading {MODEL_NAME} for verbatim transcription...")

# Load model and processor
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True
)
model.to(device)

processor = AutoProcessor.from_pretrained(MODEL_NAME)

# Create pipeline optimized for verbatim output
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    chunk_length_s=30,
    batch_size=8,  # Reduced batch size for stability
    torch_dtype=torch_dtype,
    device=device,
)

print("Model loaded successfully!")

def get_audio_duration(audio_path):
    """Get duration of audio file in seconds."""
    try:
        audio = AudioSegment.from_file(audio_path)
        return len(audio) / 1000.0
    except:
        return None

def slice_audio(audio_path, chunk_duration=300):
    """
    Slice audio into chunks of specified duration (in seconds).
    Default is 5 minutes (300 seconds) per chunk.
    """
    audio = AudioSegment.from_file(audio_path)
    duration_ms = len(audio)
    chunk_duration_ms = chunk_duration * 1000
    
    chunks = []
    for i in range(0, duration_ms, chunk_duration_ms):
        chunk = audio[i:i + chunk_duration_ms]
        
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
            chunk.export(temp_file.name, format="wav")
            chunks.append(temp_file.name)
    
    return chunks

@spaces.GPU
def transcribe_audio_chunk(audio_input, task="transcribe", language=None, return_timestamps=False):
    """
    Transcribe a single audio chunk with CrisperWhisper.
    This model is specifically trained for verbatim transcription.
    """
    try:
        generate_kwargs = {
            "task": task,
        }
        
        if language:
            generate_kwargs["language"] = language
        
        # Only add timestamps if requested and handle the potential error
        if return_timestamps:
            try:
                generate_kwargs["return_timestamps"] = "word"
                result = pipe(audio_input, generate_kwargs=generate_kwargs)
                return result
            except RuntimeError as e:
                if "size of tensor" in str(e):
                    # Fallback to chunk-level timestamps if word-level fails
                    print("Word-level timestamps failed, trying chunk-level...")
                    generate_kwargs["return_timestamps"] = True
                    result = pipe(audio_input, generate_kwargs=generate_kwargs)
                    return result
                raise
        else:
            # No timestamps requested
            result = pipe(audio_input, generate_kwargs=generate_kwargs)
            return result
            
    except Exception as e:
        # Last resort fallback: try with minimal parameters
        print(f"Error with generate_kwargs: {e}")
        try:
            result = pipe(audio_input)
            return result
        except Exception as e2:
            raise Exception(f"Transcription failed: {str(e2)}")

def create_srt_file(transcription_data, output_path):
    """
    Create an SRT subtitle file from transcription data.
    """
    with open(output_path, 'w', encoding='utf-8') as f:
        counter = 1
        for item in transcription_data:
            start_time = item['start']
            end_time = item['end']
            text = item['text'].strip()
            
            if text:  # Only add non-empty subtitles
                # Convert seconds to SRT time format (HH:MM:SS,mmm)
                start_srt = format_timestamp_srt(start_time)
                end_srt = format_timestamp_srt(end_time)
                
                f.write(f"{counter}\n")
                f.write(f"{start_srt} --> {end_srt}\n")
                f.write(f"{text}\n\n")
                counter += 1

def format_timestamp_srt(seconds):
    """Convert seconds to SRT timestamp format."""
    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    secs = int(seconds % 60)
    millis = int((seconds % 1) * 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"

def extract_audio_from_video(video_path):
    """Extract audio from video file using ffmpeg."""
    try:
        audio_path = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name
        
        # Use ffmpeg directly for more reliable extraction
        cmd = [
            'ffmpeg',
            '-i', video_path,
            '-vn',  # No video
            '-acodec', 'pcm_s16le',
            '-ar', '16000',  # 16kHz sample rate for Whisper
            '-ac', '1',  # Mono
            '-y',  # Overwrite output
            audio_path
        ]
        
        subprocess.run(cmd, check=True, capture_output=True)
        return audio_path
    except Exception as e:
        raise Exception(f"Failed to extract audio: {str(e)}")

def burn_subtitles_to_video(video_path, srt_path, progress=gr.Progress()):
    """
    Burn subtitles into video using ffmpeg.
    """
    try:
        progress(0.7, desc="Creating video with subtitles...")
        output_path = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name
        
        # Escape the SRT path for ffmpeg filter
        srt_escaped = srt_path.replace('\\', '\\\\').replace(':', '\\:')
        
        # Use ffmpeg to burn subtitles
        cmd = [
            'ffmpeg',
            '-i', video_path,
            '-vf', f"subtitles={srt_escaped}:force_style='FontName=Arial,FontSize=24,PrimaryColour=&HFFFFFF,OutlineColour=&H000000,Outline=2,Alignment=2,MarginV=50'",
            '-c:a', 'copy',
            '-y',
            output_path
        ]
        
        subprocess.run(cmd, check=True, capture_output=True)
        progress(1.0, desc="Done!")
        return output_path
        
    except Exception as e:
        raise Exception(f"Failed to create subtitled video: {str(e)}")

def merge_subtitle_segments(segments, max_duration=5.0, max_words=15):
    """
    Merge small subtitle segments into larger, more readable ones.
    """
    if not segments:
        return []

    merged = []
    # Start with the first segment
    current_segment = segments[0].copy()
    
    for i in range(1, len(segments)):
        next_segment = segments[i]
        
        # Combine text and calculate new word count
        new_text = current_segment['text'] + " " + next_segment['text'].lstrip()
        new_word_count = len(new_text.split())
        
        # Calculate new duration
        new_duration = next_segment['end'] - current_segment['start']

        # If merging doesn't exceed limits, merge
        if new_duration <= max_duration and new_word_count <= max_words:
            current_segment['end'] = next_segment['end']
            current_segment['text'] = new_text
        else:
            # Otherwise, save the current segment and start a new one
            merged.append(current_segment)
            current_segment = next_segment.copy()
            
    # Don't forget the last segment
    merged.append(current_segment)
    
    return merged

@spaces.GPU
def process_video(video_path, task="transcribe", language=None, subtitle_format="burned", progress=gr.Progress()):
    """
    Process video: extract audio, transcribe, and add subtitles.
    """
    if video_path is None:
        return None, "Please provide a video file.", None
    
    temp_files = []
    srt_path = None # Initialize to prevent NameError in finally block
    
    try:
        # Extract audio from video
        progress(0, desc="Extracting audio from video...")
        audio_path = extract_audio_from_video(video_path)
        temp_files.append(audio_path)
        
        # Check audio duration
        duration = get_audio_duration(audio_path)
        chunk_duration = 300  # 5 minutes per chunk
        
        if duration and duration > chunk_duration:
            progress(0.1, desc=f"Audio is {duration:.1f}s long. Slicing into chunks...")
            audio_chunks = slice_audio(audio_path, chunk_duration)
            temp_files.extend(audio_chunks)
        else:
            audio_chunks = [audio_path]
        
        # Transcribe each chunk with timestamps
        all_transcriptions = []
        total_chunks = len(audio_chunks)
        
        for idx, chunk_path in enumerate(audio_chunks):
            progress(0.1 + (idx / total_chunks) * 0.5, desc=f"Transcribing chunk {idx + 1}/{total_chunks}...")
            
            result = transcribe_audio_chunk(chunk_path, task, language, return_timestamps=True)
            
            if "chunks" in result:
                chunk_offset = idx * chunk_duration
                
                for word_chunk in result["chunks"]:
                    start = word_chunk["timestamp"][0]
                    end = word_chunk["timestamp"][1]
                    if start is not None and end is not None:
                        all_transcriptions.append({
                            "start": start + chunk_offset,
                            "end": end + chunk_offset,
                            "text": word_chunk["text"]
                        })
        
        if not all_transcriptions:
            return None, "No transcription data available. Timestamps may have failed.", None
        
        # Merge close timestamps for better subtitle readability
        progress(0.6, desc="Optimizing subtitle timing...")
        merged_transcriptions = merge_subtitle_segments(all_transcriptions, max_duration=5.0, max_words=15)
        
        # Generate full text transcript
        full_text = "".join([t["text"] for t in merged_transcriptions]).strip()
        transcript_output = f"**Verbatim Transcription:**\n{full_text}\n\n"
        transcript_output += f"*Total duration: {duration:.1f}s | {len(merged_transcriptions)} subtitle segments*"
        
        # Create SRT file (needed for all formats)
        srt_path = tempfile.NamedTemporaryFile(suffix=".srt", delete=False).name
        create_srt_file(merged_transcriptions, srt_path)
        temp_files.append(srt_path)
        
        if subtitle_format == "burned":
            # Burn subtitles into video
            output_video = burn_subtitles_to_video(video_path, srt_path, progress)
            return output_video, transcript_output, None
        
        elif subtitle_format == "srt":
            # Return SRT file only
            progress(0.7, desc="Creating SRT subtitle file...")
            return None, transcript_output, srt_path
        
        else:  # both
            progress(0.7, desc="Creating video with subtitles and SRT file...")
            output_video = burn_subtitles_to_video(video_path, srt_path, progress)
            # Create a copy of SRT for download
            srt_download = tempfile.NamedTemporaryFile(suffix=".srt", delete=False).name
            import shutil
            shutil.copy(srt_path, srt_download)
            return output_video, transcript_output, srt_download
        
    except Exception as e:
        return None, f"Error processing video: {str(e)}", None
    
    finally:
        # Clean up temporary audio files (keep video and srt outputs)
        for temp_file in temp_files:
            try:
                # srt_path could be None if an error occurs early
                if srt_path and os.path.exists(temp_file) and temp_file != srt_path:
                    os.unlink(temp_file)
                elif os.path.exists(temp_file):
                     os.unlink(temp_file)
            except:
                pass

def transcribe_audio(audio, task="transcribe", return_timestamps=False, language=None, export_srt=False, progress=gr.Progress()):
    """
    Transcribe audio with VERY VERBATIM output using CrisperWhisper.
    This model transcribes every spoken word exactly as it is, including fillers, stutters, and false starts.
    """
    if audio is None:
        return "Please provide an audio file or recording.", None

    # If SRT export is requested, we must generate timestamps.
    if export_srt:
        return_timestamps = True
        
    temp_files = []
    
    try:
        # Handle different audio input formats
        if isinstance(audio, str):
            audio_path = audio
        elif isinstance(audio, tuple):
            sr, audio_data = audio
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
                import scipy.io.wavfile
                scipy.io.wavfile.write(temp_file.name, sr, audio_data)
                audio_path = temp_file.name
                temp_files.append(audio_path)
        else:
            return "Unsupported audio format.", None
            
        # Check audio duration and slice if necessary
        duration = get_audio_duration(audio_path)
        chunk_duration = 300  # 5 minutes per chunk
        
        if duration and duration > chunk_duration:
            progress(0, desc=f"Audio is {duration:.1f}s long. Slicing into chunks...")
            audio_chunks = slice_audio(audio_path, chunk_duration)
            temp_files.extend(audio_chunks)
        else:
            audio_chunks = [audio_path]
            
        # Process each chunk
        all_word_chunks = []
        full_text_parts = []
        total_chunks = len(audio_chunks)
        
        for idx, chunk_path in enumerate(audio_chunks):
            progress((idx + 1) / total_chunks, desc=f"Transcribing chunk {idx + 1}/{total_chunks}...")
            
            result = transcribe_audio_chunk(chunk_path, task, language, return_timestamps)
            
            full_text_parts.append(result["text"])

            if return_timestamps and "chunks" in result:
                chunk_offset = idx * chunk_duration
                for word_chunk in result["chunks"]:
                    start = word_chunk["timestamp"][0]
                    end = word_chunk["timestamp"][1]
                    if start is not None and end is not None:
                        all_word_chunks.append({
                            "start": start + chunk_offset,
                            "end": end + chunk_offset,
                            "text": word_chunk["text"]
                        })
                        
        # Combine all transcriptions
        full_text = "".join(full_text_parts).strip()
        
        output = f"**Verbatim Transcription:**\n{full_text}\n"
        srt_file_path = None
        
        if return_timestamps and all_word_chunks:
             # If timestamps are requested but not for SRT, display them in the textbox
            if not export_srt:
                output += "\n**Word-level Timestamps:**\n"
                for ts in all_word_chunks:
                    output += f"[{ts['start']:.2f}s - {ts['end']:.2f}s]{ts['text']}\n"
        
        # Generate SRT file if requested
        if export_srt:
            if all_word_chunks:
                merged_transcriptions = merge_subtitle_segments(all_word_chunks, max_duration=5.0, max_words=15)
                srt_file = tempfile.NamedTemporaryFile(suffix=".srt", delete=False).name
                create_srt_file(merged_transcriptions, srt_file)
                srt_file_path = srt_file
            else:
                 output += "\n**Warning:** Could not generate SRT file as word-level timestamps were not available."

        if duration:
            output += f"\n*Total duration: {duration:.1f}s | Processed in {total_chunks} chunk(s)*"
            
        return output, srt_file_path
        
    except Exception as e:
        return f"Error during transcription: {str(e)}", None
        
    finally:
        # Clean up temporary files
        for temp_file in temp_files:
            try:
                if os.path.exists(temp_file):
                    os.unlink(temp_file)
            except:
                pass

# Language options for manual selection
LANGUAGES = {
    "Auto-detect": None,
    "English": "en",
    "Spanish": "es",
    "French": "fr",
    "German": "de",
    "Italian": "it",
    "Portuguese": "pt",
    "Dutch": "nl",
    "Russian": "ru",
    "Chinese": "zh",
    "Japanese": "ja",
    "Korean": "ko",
    "Arabic": "ar",
    "Hindi": "hi",
    "Turkish": "tr",
    "Polish": "pl",
    "Ukrainian": "uk",
    "Vietnamese": "vi",
    "Thai": "th",
    "Indonesian": "id",
    "Czech": "cs",
    "Romanian": "ro",
    "Swedish": "sv",
    "Danish": "da",
    "Norwegian": "no",
    "Finnish": "fi",
    "Greek": "el",
    "Hebrew": "he",
}

# Create Gradio interface
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 🎙️ Very Verbatim Multilingual Speech-to-Text
        
        Powered by **CrisperWhisper** - specifically designed for verbatim transcription with ZeroGPU acceleration.
        
        ## 🔥 TRUE Verbatim Transcription
        
        Unlike standard Whisper (which omits disfluencies), **CrisperWhisper captures EVERYTHING**:
        
        - ✅ **Fillers**: um, uh, ah, er, mm, like, you know
        - ✅ **Hesitations**: pauses, breath sounds, stutters
        - ✅ **False Starts**: "I was- I went to the store"
        - ✅ **Repetitions**: "I I I think that..."
        - ✅ **Disfluencies**: Every non-fluent speech element
        - ✅ **Accurate Word-Level Timestamps**: Precise timing even around disfluencies
        - ✅ **Multilingual**: Supports 99+ languages
        - ✅ **Long Audio Support**: Automatic 5-minute chunking
        - ✅ **Video Subtitles**: Automatic caption generation with burned-in or SRT output
        
        **Perfect for:** Legal transcription, linguistic research, therapy sessions, interviews, 
        conversational AI training, video subtitling, or any use case requiring exact speech capture.
        """
    )
    
    with gr.Tabs():
        # Audio Tab
        with gr.Tab("🎤 Audio Transcription"):
            with gr.Row():
                with gr.Column():
                    audio_input = gr.Audio(
                        sources=["upload", "microphone"],
                        type="filepath",
                        label="Audio Input"
                    )
                    
                    with gr.Row():
                        task_radio = gr.Radio(
                            choices=["transcribe", "translate"],
                            value="transcribe",
                            label="Task",
                            info="Transcribe verbatim or translate to English"
                        )
                        
                        language_dropdown = gr.Dropdown(
                            choices=list(LANGUAGES.keys()),
                            value="Auto-detect",
                            label="Language",
                            info="Select language or use auto-detect"
                        )
                    
                    timestamps_checkbox = gr.Checkbox(
                        label="Show word-level timestamps in text output",
                        value=False,
                        info="Display precise timing for each word"
                    )
                    
                    export_srt_checkbox = gr.Checkbox(
                        label="Export as SRT file",
                        value=False,
                        info="Generate downloadable SRT subtitle file"
                    )
                    
                    transcribe_btn = gr.Button("🎯 Transcribe Verbatim", variant="primary", size="lg")
                
                with gr.Column():
                    output_text = gr.Textbox(
                        label="Verbatim Transcription (includes all um, uh, hesitations)",
                        lines=18,
                        show_copy_button=True,
                        placeholder="Your VERY verbatim transcription will appear here...\n\nEvery um, uh, stutter, and hesitation will be captured!"
                    )
                    
                    output_audio_srt = gr.File(
                        label="Download SRT Subtitles",
                        interactive=False,
                        visible=False
                    )
            
            gr.Markdown(
                """
                ### Why CrisperWhisper for Verbatim?
                
                **Standard Whisper** is trained to transcribe the "intended meaning" - it automatically cleans up:
                - ❌ Removes "um", "uh", "ah" 
                - ❌ Omits false starts
                - ❌ Skips repetitions
                - ❌ Ignores stutters
                
                **CrisperWhisper** is specifically trained for verbatim transcription:
                - ✅ Keeps every filler word
                - ✅ Preserves all disfluencies
                - ✅ Captures exact speech patterns
                - ✅ Accurate timestamps around hesitations
                - ✅ Export as SRT file for use in video editors, YouTube, etc.
                """
            )
        
        # Video Tab
        with gr.Tab("🎬 Video Subtitles"):
            with gr.Row():
                with gr.Column():
                    video_input = gr.Video(
                        label="Video Input",
                        sources=["upload"]
                    )
                    
                    with gr.Row():
                        video_task_radio = gr.Radio(
                            choices=["transcribe", "translate"],
                            value="transcribe",
                            label="Task",
                            info="Transcribe verbatim or translate to English"
                        )
                        
                        video_language_dropdown = gr.Dropdown(
                            choices=list(LANGUAGES.keys()),
                            value="Auto-detect",
                            label="Language",
                            info="Select language or use auto-detect"
                        )
                    
                    subtitle_format_radio = gr.Radio(
                        choices=[
                            ("Burned-in subtitles (permanent)", "burned"),
                            ("SRT file only (external subtitles)", "srt"),
                            ("Both burned-in video + SRT file", "both")
                        ],
                        value="burned",
                        label="Subtitle Format",
                        info="Choose output format"
                    )
                    
                    process_video_btn = gr.Button("🎬 Generate Subtitles", variant="primary", size="lg")
                
                with gr.Column():
                    output_video = gr.Video(
                        label="Video with Subtitles",
                        interactive=False
                    )
                    
                    video_transcript = gr.Textbox(
                        label="Verbatim Transcript",
                        lines=10,
                        show_copy_button=True,
                        placeholder="Transcript will appear here..."
                    )
                    
                    output_srt = gr.File(
                        label="Download SRT Subtitles",
                        interactive=False
                    )
            
            gr.Markdown(
                """
                ### Video Subtitle Features
                
                - **Burned-in Subtitles**: Permanently embedded in video (white text with black outline)
                - **SRT File**: Standard subtitle file with timestamps (HH:MM:SS,mmm format)
                  - Compatible with YouTube, VLC, Premiere Pro, Final Cut, DaVinci Resolve
                  - Easy to edit timings and text in any text editor
                  - Can be translated and re-synced
                - **Verbatim Captions**: All hesitations, fillers, and disfluencies included
                - **Smart Timing**: Automatically merges short segments for readability
                - **Long Video Support**: Handles videos of any length (automatic chunking)
                
                ### SRT File Format Example
                ```
                1
                00:00:01,500 --> 00:00:03,200
                Um, so I was thinking that
                
                2
                00:00:03,200 --> 00:00:05,800
                we could, uh, go to the store
                ```
                
                ### Tips
                
                - Use "Burned-in" for sharing videos with guaranteed subtitle visibility
                - Use "SRT file" for flexible editing, translation, and platform uploads
                - Use "Both" to have maximum flexibility
                - SRT files work with all major video platforms and editors
                - Subtitles are positioned at the bottom center of the video
                """
            )
    
    gr.Markdown(
        """
        ### Use Cases
        
        - **Legal/Court Transcription**: Exact wording required by law
        - **Linguistic Research**: Study of natural speech patterns and disfluencies
        - **Medical/Therapy Sessions**: Capturing patient speech patterns
        - **Interview Transcription**: Preserving speaker mannerisms
        - **Conversational AI Training**: Realistic dialogue data
        - **Accessibility**: Complete transcripts and captions for deaf/hard-of-hearing
        - **Video Content**: YouTube, social media, educational content with accurate captions
        - **Language Learning**: Analyzing natural spoken language
        
        ### Tips for Best Results
        
        - Clear audio with minimal background noise works best
        - The model captures quiet speech - ensure consistent audio levels
        - Manual language selection can improve accuracy
        - Long files are automatically processed in 5-minute chunks
        - For videos, ensure good audio quality for best subtitle accuracy
        """
    )
    
    # Set up event handlers
    def transcribe_wrapper(audio, task, timestamps, export_srt, language_name, progress=gr.Progress()):
        language_code = LANGUAGES[language_name]
        transcript, srt_file = transcribe_audio(audio, task, timestamps, language_code, export_srt, progress)
        
        # Control visibility of SRT download
        srt_visible = gr.update(visible=srt_file is not None, value=srt_file)
        
        return transcript, srt_visible
    
    def video_wrapper(video, task, language_name, subtitle_format, progress=gr.Progress()):
        language_code = LANGUAGES[language_name]
        return process_video(video, task, language_code, subtitle_format, progress)
    
    transcribe_btn.click(
        fn=transcribe_wrapper,
        inputs=[audio_input, task_radio, timestamps_checkbox, export_srt_checkbox, language_dropdown],
        outputs=[output_text, output_audio_srt]
    )
    
    process_video_btn.click(
        fn=video_wrapper,
        inputs=[video_input, video_task_radio, video_language_dropdown, subtitle_format_radio],
        outputs=[output_video, video_transcript, output_srt]
    )

# Launch the app
if __name__ == "__main__":
    demo.launch()