"""Gradio demo that predicts the emotion of a speech clip with Wav2Vec2."""

import gradio as gr
import torch
import torchaudio
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor

# Load model and processor from Hugging Face
model_name = "Dpngtm/wav2vec2-emotion-recognition"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)

# Emotion labels from the model card
labels = [
    "angry",
    "calm",
    "disgust",
    "fearful",
    "happy",
    "neutral",
    "sad",
    "surprised",
]

# Sampling rate (Hz) the audio is converted to before it reaches the processor.
TARGET_SAMPLE_RATE = 16000

# Shown instead of a prediction when the callback receives no usable audio.
NO_AUDIO_MESSAGE = "No audio received. Please record or upload a clip and try again."


def _to_mono_float(samples):
    """Return *samples* as a 1-D float32 tensor scaled to roughly [-1, 1]."""
    waveform = torch.as_tensor(samples)
    if waveform.is_floating_point():
        waveform = waveform.to(torch.float32)
    else:
        # Integer PCM (Gradio normally delivers int16): rescale by the
        # dtype's full-scale value so amplitudes land in [-1, 1].
        full_scale = torch.iinfo(waveform.dtype).max
        waveform = waveform.to(torch.float32) / full_scale
    if waveform.dim() > 1:
        # Gradio lays multi-channel audio out as (samples, channels);
        # average the channels to obtain a mono signal.
        waveform = waveform.mean(dim=1)
    return waveform


# Emotion prediction function
def predict_emotion(audio) -> str:
    """Classify the emotion expressed in a recorded audio clip.

    Args:
        audio: Value produced by a ``gr.Audio(type="numpy")`` component: a
            ``(sample_rate, data)`` tuple, or ``None`` when nothing was
            recorded. ``data`` is an array of shape ``(samples,)`` or
            ``(samples, channels)``.

    Returns:
        A Markdown string naming the predicted emotion, or a short hint
        asking for audio when the input is missing or empty.
    """
    if audio is None:
        return NO_AUDIO_MESSAGE

    # Gradio's numpy audio format puts the sample rate first, then the data.
    sr, speech = audio
    waveform = _to_mono_float(speech)
    if waveform.numel() == 0:
        return NO_AUDIO_MESSAGE

    if sr != TARGET_SAMPLE_RATE:
        resampler = torchaudio.transforms.Resample(
            orig_freq=sr, new_freq=TARGET_SAMPLE_RATE
        )
        waveform = resampler(waveform)

    input_values = processor(
        waveform.numpy(),
        sampling_rate=TARGET_SAMPLE_RATE,
        return_tensors="pt",
    ).input_values

    with torch.no_grad():
        logits = model(input_values).logits

    predicted_id = int(torch.argmax(logits, dim=-1).item())
    emotion = labels[predicted_id]
    return f"Predicted Emotion: **{emotion}**"


def _build_audio_input():
    """Create the audio input component on both old and new Gradio APIs."""
    label = "Speak or Upload Audio"
    try:
        # Gradio 3.x keyword, as originally written.
        return gr.Audio(source="microphone", type="numpy", label=label)
    except TypeError:
        # Gradio 4+ replaced ``source`` with the list-valued ``sources``,
        # which also allows offering the upload option the label promises.
        return gr.Audio(
            sources=["microphone", "upload"], type="numpy", label=label
        )


# Gradio interface
interface = gr.Interface(
    fn=predict_emotion,
    inputs=_build_audio_input(),
    outputs=gr.Markdown(label="Detected Emotion"),
    title="Voice Emotion Recognition",
    description="This app detects the emotional tone of your speech using a fine-tuned Wav2Vec2 model.",
)

interface.launch()