import gradio as gr
import torch
import torchaudio
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor

# Checkpoint identifier on the Hugging Face Hub; the processor and the
# classifier are both loaded from this same name.
model_name = "Dpngtm/wav2vec2-emotion-recognition"

# Loaded once at import time so every prediction reuses the same objects.
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)

# Emotion labels from the model card; looked up by predicted class index.
labels = [
    "angry",
    "calm",
    "disgust",
    "fearful",
    "happy",
    "neutral",
    "sad",
    "surprised",
]

# Emotion prediction function
def predict_emotion(audio):
    """Classify the emotion expressed in a recorded audio clip.

    Args:
        audio: A two-item tuple as delivered by ``gr.Audio(type="numpy")``,
            i.e. ``(sample_rate, samples)``. The reversed order
            ``(samples, sample_rate)`` is accepted as well. ``samples`` may
            be integer PCM or floating point, and either 1-D (mono) or 2-D
            laid out as ``(num_samples, num_channels)``. ``None`` means no
            audio was provided.

    Returns:
        A Markdown string naming the predicted emotion, or a short prompt
        asking for audio when the input is missing or empty.
    """
    no_audio_message = "No audio received. Please record a clip and try again."
    if audio is None:
        return no_audio_message

    # Gradio passes (sample_rate, samples). Tell the two items apart by which
    # one is a sequence so callers using (samples, sample_rate) keep working.
    first, second = audio
    if hasattr(first, "__len__"):
        speech, sr = first, second
    else:
        sr, speech = first, second

    waveform = torch.as_tensor(speech)
    if waveform.numel() == 0:
        return no_audio_message

    # Resampling needs floating-point input; integer PCM (e.g. int16 from the
    # microphone) is scaled by its full-scale value into roughly [-1, 1].
    if waveform.is_floating_point():
        waveform = waveform.to(torch.float32)
    else:
        full_scale = float(torch.iinfo(waveform.dtype).max)
        waveform = waveform.to(torch.float32) / full_scale

    # Down-mix (num_samples, num_channels) recordings to mono; otherwise the
    # channel axis would be resampled and later treated as a batch.
    if waveform.ndim > 1:
        waveform = waveform.mean(dim=-1)

    target_sr = 16000
    sr = int(sr)
    if sr != target_sr:
        waveform = torchaudio.transforms.Resample(sr, target_sr)(waveform)

    # The feature extractor is documented to take numpy arrays / float lists.
    input_values = processor(
        waveform.numpy(), sampling_rate=target_sr, return_tensors="pt"
    ).input_values

    with torch.no_grad():
        logits = model(input_values).logits

    predicted_id = torch.argmax(logits, dim=-1).item()
    emotion = labels[predicted_id]
    return f"Predicted Emotion: **{emotion}**"

# Gradio interface: microphone audio in, Markdown text out.
# NOTE(review): `source=` is the Gradio 3.x spelling; Gradio 4 renamed it to
# `sources=[...]` — confirm against the installed version before upgrading.
interface = gr.Interface(
    title="Voice Emotion Recognition",
    description="This app detects the emotional tone of your speech using a fine-tuned Wav2Vec2 model.",
    fn=predict_emotion,
    inputs=gr.Audio(
        source="microphone",
        type="numpy",  # handler receives the recording as raw samples, not a file path
        label="Speak or Upload Audio",
    ),
    outputs=gr.Markdown(label="Detected Emotion"),
)

# Start the web server for the app.
interface.launch()