ranialahassn committed on
Commit
8660e20
·
verified ·
1 Parent(s): 393e560

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +79 -0
app.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import torchaudio
4
+ import numpy as np
5
+ import librosa
6
+ from transformers import WhisperProcessor, WhisperForConditionalGeneration
7
+ from transformers import XLMRobertaTokenizerFast, XLMRobertaForSequenceClassification
8
+ import soundfile as sf
9
+
10
+ # --- Load models ---
11
# Checkpoint identifiers, named once so each processor/model pair below is
# guaranteed to load from the same checkpoint.
WHISPER_MODEL_ID = "openai/whisper-base"
LANG_MODEL_ID = "papluca/xlm-roberta-base-language-detection"

# Speech-to-text: the processor prepares audio features and decodes token ids;
# the model generates the token ids.
whisper_processor = WhisperProcessor.from_pretrained(WHISPER_MODEL_ID)
whisper_model = WhisperForConditionalGeneration.from_pretrained(WHISPER_MODEL_ID)

# Text language identification: tokenizer plus sequence-classification head.
lang_tokenizer = XLMRobertaTokenizerFast.from_pretrained(LANG_MODEL_ID)
lang_model = XLMRobertaForSequenceClassification.from_pretrained(LANG_MODEL_ID)
16
+
17
+
18
+ # --- Convert audio to text ---
19
def audio_to_text(audio_path):
    """Transcribe an audio file to text with the Whisper model.

    Args:
        audio_path: Path to an audio file readable by ``torchaudio.load``.

    Returns:
        The decoded transcription with surrounding whitespace stripped.
    """
    waveform, sample_rate = torchaudio.load(audio_path)

    # torchaudio.load yields a (channels, samples) tensor. Average the
    # channels into a single mono waveform; squeeze() alone would leave
    # multi-channel input two-dimensional.
    if waveform.dim() > 1:
        waveform = waveform.mean(dim=0)

    # The processor is called with sampling_rate=16000 below, so bring any
    # other rate to 16 kHz first.
    if sample_rate != 16000:
        resample = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resample(waveform)

    input_features = whisper_processor(
        waveform.numpy(), sampling_rate=16000, return_tensors="pt"
    ).input_features

    predicted_ids = whisper_model.generate(input_features)
    transcription = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    return transcription.strip()
31
+
32
+
33
+ # --- Detect language from text ---
34
def detect_language(text):
    """Classify the language of ``text`` with the XLM-RoBERTa detector.

    Args:
        text: The text to classify.

    Returns:
        A display string holding the predicted label and its softmax
        probability expressed as a percentage with two decimals.
    """
    encoded = lang_tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    # Inference only: no gradients are needed for the forward pass.
    with torch.no_grad():
        logits = lang_model(**encoded).logits

    probabilities = torch.softmax(logits, dim=1)
    best_index = probabilities.argmax().item()
    label = lang_model.config.id2label[best_index]
    percent = probabilities[0][best_index].item() * 100
    return f"🌐 Language: {label} | Confidence: {percent:.2f}%"
43
+
44
+
45
+ # --- Gradio function ---
46
def detect_language_from_audio(audio_file):
    """Gradio callback: transcribe an audio file and report its language.

    Args:
        audio_file: Path to the audio file, or ``None`` when no file was
            provided.

    Returns:
        A user-facing message: the detection result on success, otherwise
        a message starting with "❌" that describes what went wrong.
    """
    import os
    import tempfile

    if audio_file is None:
        return "❌ No file selected."

    try:
        # Decode with librosa at 16 kHz and re-encode as WAV for the
        # transcription step.
        data, sr = librosa.load(audio_file, sr=16000)

        # Use a per-call temporary directory rather than a fixed "temp.wav"
        # in the working directory: simultaneous requests would otherwise
        # overwrite each other's audio, and the file was never cleaned up.
        with tempfile.TemporaryDirectory() as workdir:
            temp_wav = os.path.join(workdir, "temp.wav")
            sf.write(temp_wav, data, sr)

            # Step 1: Convert audio to text
            text = audio_to_text(temp_wav)

        if not text:
            return "❌ Failed to extract text from audio."

        # Step 2: Detect language
        return detect_language(text)

    except Exception as e:
        # UI boundary: report any failure as a message in the result box
        # instead of letting the exception propagate.
        return f"❌ Runtime error: {str(e)}"
66
+
67
+
68
+ # --- Gradio Interface ---
69
# One audio input (handed to the callback as a file path) and one text output.
# The title's emoji was stored as mis-decoded bytes; it is restored here.
iface = gr.Interface(
    fn=detect_language_from_audio,
    inputs=gr.Audio(type="filepath", label="Choose Audio File (WAV/MP3)"),
    outputs=gr.Textbox(label="Result"),
    title="🎙️ Voice Language Detector",
    description="Upload a voice file and the model will detect its language using Whisper + XLM-Roberta.",
)

# --- Entry point ---
# Start the web UI only when run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()