import os

import cv2
import gradio as gr
import moviepy.editor as mp
import numpy as np
import speech_recognition as sr
from groq import Groq
from gtts import gTTS
from PIL import Image
from transformers import pipeline

# Read the Groq API key from the environment instead of hard-coding a secret in the source
client = Groq(
    api_key=os.environ.get("GROQ_API_KEY"),
)

# Initialize the emotion-recognition pipelines
image_pipeline = pipeline("image-classification", model="trpakov/vit-face-expression", top_k=1)
audio_pipeline = pipeline("audio-classification", model="audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim")
text_pipeline = pipeline("text-classification", model="SamLowe/roberta-base-go_emotions", top_k=2)

conversation_history = []


def process_input(video_stream):
    if isinstance(video_stream, str):
        video_file_path = video_stream

    image_features_list = []
    audio_emotion = ""
    text_input = ""
    text_emotions = ""

    # Process video frames: classify the facial expression in each frame
    cap = cv2.VideoCapture(video_file_path)
    frame_count = 0
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Convert the BGR OpenCV frame to a PIL image
        pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

        # Analyze the frame
        try:
            image_analysis = image_pipeline(pil_image)
            if image_analysis:
                image_features_list.append(image_analysis[0]['label'])
        except Exception as e:
            print(f"Error processing image data: {e}")

        # Increment frame count
        frame_count += 1
    cap.release()

    # Combine the per-frame expression labels into a single string
    image_features = ', '.join(image_features_list)
    print("Image features:", image_features)

    # Process audio data and get the emotion label
    try:
        # Extract the audio track as 16 kHz mono WAV, the format the wav2vec2 emotion model expects
        video_clip = mp.VideoFileClip(video_file_path)
        audio_file_path = os.path.join("/tmp", "audio.wav")
        video_clip.audio.write_audiofile(audio_file_path, fps=16000, ffmpeg_params=["-ac", "1"])
        video_clip.close()

        recognizer = sr.Recognizer()
        with sr.AudioFile(audio_file_path) as source:
            audio = recognizer.record(source)

        # Convert the int16 PCM samples to a float32 waveform in [-1.0, 1.0]
        audio_data = np.frombuffer(audio.frame_data, dtype=np.int16)
        audio_data = audio_data.astype(np.float32) / 32768.0

        audio_emotions = audio_pipeline(audio_data)
        if audio_emotions:
            audio_emotion = audio_emotions[0]['label']
            print("Audio emotion:", audio_emotion)

        # Transcribe the speech
        text_input = recognizer.recognize_google(audio)
        print("User said:", text_input)
    except Exception as e:
        print(f"Error processing audio data: {e}")

    # Process the transcribed text and get its emotion labels
    try:
        text_analysis = text_pipeline(text_input)
        print("Text analysis:", text_analysis)

        # Some pipeline versions return a nested list; flatten it if needed
        if text_analysis and isinstance(text_analysis[0], list):
            text_analysis = [item for sublist in text_analysis for item in sublist]

        # Collect the predicted emotion labels
        text_emotions_list = [
            item['label'] for item in text_analysis
            if isinstance(item, dict) and 'label' in item
        ]

        if text_emotions_list:
            text_emotions = ', '.join(text_emotions_list)
        else:
            text_emotions = "No significant emotions detected in the text."
except Exception as e: print(f"Error processing text data: {e}") print("Text emotions:", text_emotions) conversation_history.append({ "user_input": text_input, "image_features": image_features, "audio_emotion": audio_emotion, "text_emotions": text_emotions }) prompt = "User said: " + text_input if image_features: prompt += "\nImage features: " + ', '.join(image_features) if audio_emotion: prompt += "\nAudio emotion: " + audio_emotion if text_emotions: prompt += "\nText emotions: " + text_emotions print("image_feature",image_features) print("Audio",audio_emotion) print("text emotions",text_emotions) chat_completion = client.chat.completions.create( messages=[ {"role": "system", "content": "As a mental health therapist, you're speaking to a user who is seeking guidance and support. They may be experiencing various challenges and are looking for solutions to improve their mental well-being. Your responses should be empathetic, supportive, and offer practical advice tailored to the user's specific issues. Remember to maintain a positive and non-judgmental tone throughout the interaction." }, {"role": "user", "content": prompt } ], model="llama3-70b-8192", temperature=0.5, max_tokens=1024, top_p=1, stop=None, stream=False, ) ai_response = chat_completion.choices[0].message.content conversation_history.append({"ai_response": ai_response}) print(ai_response) # Convert AI response to audio tts = gTTS(text=ai_response, lang='en') audio_file_path = "/tmp/ai_response.wav" tts.save(audio_file_path) return ai_response,audio_file_path,display_history() # Return the generated response def display_history(): history_str = "" for i, turn in enumerate(conversation_history): # if "user_input" in turn: # history_str += f"User: {turn['user_input']}\n" if "ai_response" in turn: history_str += f"{turn['ai_response']}\n\n" return history_str # Create the Gradio interface input_video = gr.Video(sources="webcam",label="Your Video", include_audio=True) output_text = gr.Textbox(label="Therapist Response") output_audio=gr.Audio(autoplay=True,visible=False) history_text = gr.Textbox(display_history(), label="Conversation History", placeholder="") iface = gr.Interface(fn=process_input, inputs=input_video, outputs=[output_text,output_audio,history_text], title="Mental Health Therapist", description="Speak to the AI through video input and get responses.",theme=gr.themes.Default(primary_hue="teal", secondary_hue="cyan"),allow_flagging="auto") iface.launch(debug=True)