import os
import json

import gradio as gr
from transformers import pipeline
from pydub import AudioSegment
import speech_recognition as sr
html_seeker = '''<style>
html, body {
    margin: 0;
    padding: 0;
    min-width: 900px;
}
#header {
    /*position: fixed;*/
    top: 0;
    left: 0;
    height: 50px;
    min-width: 900px;
    line-height: 50px;
    width: 100%;
    background-color: #999;
    box-shadow: 0px 0px 5px 0px rgba(0,0,0,0.5);
    font-family: Helvetica, sans-serif;
}
#header, #header a {
    color: white;
}
.home {
    margin: 0;
    font-weight: bold;
    text-transform: lowercase;
    width: 100px;
}
h4.home {
    margin: 0;
    background: #666;
    padding-left: 25px;
    padding-right: 30px;
    margin-right: 20px;
    float: left;
    text-decoration: none;
}
.home:hover a {
    background: #555;
}
#audio {
    margin-left: 10px;
    width: 500px;
    display: inline-block;
}
#transcript {
    margin: 0 15px;
    margin-bottom: 5em;
    white-space: pre-wrap;
    line-height: 2em;
    max-width: 600px;
    color: #999;
    clear: both;
    margin-top: 75px;
    /*direction: rtl;*/
}
.success {
    color: black;
}
.success:hover {
    text-decoration: underline;
}
.active {
    color: magenta;
    background-color: yellow;
}
#preloader {
    visibility: hidden;
}
</style><div id="header">
    <audio id="audio" src="17.mp3" controls></audio>
</div>
<div id="transcript" dir="auto"></div>
<img src="data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7" onload="
    var oldScript = document.querySelector('script#huihiuh6');
    var newScript = document.createElement('script');
    Array.from(oldScript.attributes)
        .forEach( attr => newScript.setAttribute(attr.name, attr.value) );
    newScript.appendChild(document.createTextNode(oldScript.innerHTML));
    oldScript.parentNode.replaceChild(newScript, oldScript);
">
<script id="huihiuh6">
var $a = document.getElementById("audio");
// Reuse the src of the first <audio> element on the page.
$a.src = document.querySelector('audio').src;
console.log($a);
window.onkeydown = function(ev) {
    // Space toggles playback instead of scrolling the page.
    if (ev.code === "Space") {
        ev.preventDefault();
        if ($a.paused) { $a.play(); } else { $a.pause(); }
    }
}
var $trans = document.getElementById("transcript");
var wds = [];
var cur_wd;
function highlight_word() {
    var t = $a.currentTime;
    // XXX: O(N); use binary search
    var hits = wds.filter(function(x) {
        return (t - x['timestamp']['0']) > 0.01 && (x['timestamp']['1'] - t) > 0.01;
    });
    var next_wd = hits[hits.length - 1];
    if (cur_wd != next_wd) {
        var active = document.querySelectorAll('.active');
        for (var i = 0; i < active.length; i++) {
            active[i].classList.remove('active');
        }
        if (next_wd && next_wd.$div) {
            next_wd.$div.classList.add('active');
            //render_phones(next_wd);
        }
    }
    cur_wd = next_wd;
    //highlight_phone(t);
    window.requestAnimationFrame(highlight_word);
}
window.requestAnimationFrame(highlight_word);
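// A possible binary-search replacement for the linear scan flagged above
// (sketch only, not wired in; assumes wds is sorted by start time and
// words do not overlap):
function find_word_at(t) {
    var lo = 0, hi = wds.length - 1, best = null;
    while (lo <= hi) {
        var mid = (lo + hi) >> 1;
        if ((t - wds[mid]['timestamp']['0']) > 0.01) {
            best = wds[mid];  // starts early enough; look for a later match
            lo = mid + 1;
        } else {
            hi = mid - 1;
        }
    }
    // Discard the candidate if playback is already past its end time.
    if (best && (best['timestamp']['1'] - t) <= 0.01) { return null; }
    return best;
}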
$trans.innerHTML = "Loading...";
function render(ret) {
    wds = ret['chunks'] || [];
    var transcript = ret['text'];  // full text; kept for reference, unused below
    $trans.innerHTML = '';
    wds.forEach(function(wd) {
        var $wd = document.createElement('span');
        var txt = wd['text'];
        var $wdText = document.createTextNode(txt);
        $wd.appendChild($wdText);
        wd.$div = $wd;
        $wd.className = 'success';
        // Clicking a word seeks the player to that word's start time.
        $wd.onclick = function() {
            console.log(wd['timestamp']['0']);
            $a.currentTime = wd['timestamp']['0'];
            $a.play();
        };
        $trans.appendChild($wd);
        $trans.appendChild(document.createTextNode(' '));
    });
}
function update() {
    if (INLINE_JSON) {
        // We want this to work from file:/// domains, so we provide a
        // mechanism for inlining the alignment data.
        render(INLINE_JSON);
    }
}
var INLINE_JSON='''
# html_seeker deliberately ends mid-statement: predict_fa() splices the ASR
# JSON between the two fragments, so the rendered page reads
# `var INLINE_JSON=<json>; update(); </script>`.
html_seeker2=''';
update();
</script>'''
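
# Hypothetical debugging helper (not used by the app): splicing the ASR JSON
# between the two fragments yields a complete, self-contained page that can be
# saved and opened from file:/// for offline inspection.
def build_player_page(asr_result, path="player_debug.html"):
    page = html_seeker + json.dumps(asr_result) + html_seeker2
    with open(path, "w", encoding="utf-8") as f:
        f.write(page)
    return page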
# NOTE: these pipelines are commented out, so predict_fa below will raise a
# NameError if the (also commented) Persian tab is re-enabled without them.
# model_name = "voidful/wav2vec2-xlsr-multilingual-56"
# model0 = pipeline(task="automatic-speech-recognition",
#                   model=model_name)
# model_name = "SLPL/Sharif-wav2vec2"
# model2 = pipeline(task="automatic-speech-recognition",
#                   model=model_name)
# model_name = "ghofrani/common8"
# model1 = pipeline(task="automatic-speech-recognition",
#                   model=model_name)
def predict_fa(speech, model):
    """Run the selected ASR model with word-level timestamps."""
    if model == "SLPL/Sharif-wav2vec2":
        text = model2(speech, return_timestamps="word")
    elif model == "ghofrani/common8":
        text = model1(speech, return_timestamps="word")
    elif model == "voidful/wav2vec2-xlsr-multilingual-56":
        text = model0(speech, return_timestamps="word")
    return [text['text'], json.dumps(text), html_seeker + json.dumps(text) + html_seeker2]
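
# For reference, the transformers ASR pipeline with return_timestamps="word"
# returns a dict shaped roughly like this (illustrative values):
# {'text': 'hello world',
#  'chunks': [{'text': 'hello', 'timestamp': (0.0, 0.4)},
#             {'text': 'world', 'timestamp': (0.5, 0.9)}]}
# The embedded JavaScript above reads chunks[i]['timestamp'][0] and [1].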
def convert_to_wav(filename):
    """Convert any pydub-readable file to WAV, avoiding name collisions."""
    base, ext = os.path.splitext(filename)
    audio = AudioSegment.from_file(filename, format=ext.replace(".", ""))
    new_filename = base + ".wav"
    # Append a counter instead of stacking "(1)(1)..." suffixes.
    counter = 1
    while os.path.exists(new_filename):
        new_filename = f"{base}({counter}).wav"
        counter += 1
    print(f"Converting {filename} to {new_filename}...")
    audio.export(new_filename, format="wav")
    return new_filename
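
# Example (hypothetical path): convert_to_wav("talk.mp3") -> "talk.wav",
# or "talk(1).wav" if that name already exists.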
def g_rec(audio_File, language):
    """Transcribe a file with the Google Web Speech API via speech_recognition."""
    r = sr.Recognizer()
    print(audio_File)
    # sr.AudioFile only decodes WAV/AIFF/FLAC; uncomment to convert other
    # formats first.
    #if not os.path.splitext(audio_File)[1] == ".wav":
    #    audio_File = convert_to_wav(audio_File)
    with sr.AudioFile(audio_File) as source:
        audio = r.record(source)
    try:
        res = r.recognize_google(audio, language=language)
    except Exception as e:
        res = "Exception: " + str(e)
    return res
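
# Quick local check (hypothetical file; recognize_google needs network access):
# print(g_rec("sample.wav", "fa-IR"))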
with gr.Blocks() as demo:
    gr.Markdown("Multilingual Speech Recognition")
    # with gr.Tab("Persian models"):
    #     inputs_speech_fa = gr.Audio(sources=["upload"], type="filepath", label="Upload your audio:")
    #     inputs_model_fa = gr.Radio(label="Model", choices=["ghofrani/common8", "SLPL/Sharif-wav2vec2", "voidful/wav2vec2-xlsr-multilingual-56"])
    #     output_transcribe1_fa = gr.Textbox(label="Transcribed text:")
    #     output_transcribe1_fa1 = gr.Textbox(label="Transcribed text with timestamps:")
    #     output_transcribe1_fa2 = gr.HTML(label="")
    #     transcribe_audio1_fa = gr.Button("Submit")
    with gr.Tab("google"):
        gr.Markdown("Set your speech language")
        inputs_speech1 = [
            gr.Audio(sources=["upload"], type="filepath"),
            gr.Dropdown(choices=["af-ZA","am-ET","ar-AE","ar-BH","ar-DZ","ar-EG","ar-IL","ar-IQ","ar-JO","ar-KW","ar-LB","ar-MA","ar-MR","ar-OM","ar-PS","ar-QA","ar-SA","ar-TN","ar-YE","az-AZ","bg-BG","bn-BD","bn-IN","bs-BA","ca-ES","cs-CZ","da-DK","de-AT","de-CH","de-DE","el-GR","en-AU","en-CA","en-GB","en-GH","en-HK","en-IE","en-IN","en-KE","en-NG","en-NZ","en-PH","en-PK","en-SG","en-TZ","en-US","en-ZA","es-AR","es-BO","es-CL","es-CO","es-CR","es-DO","es-EC","es-ES","es-GT","es-HN","es-MX","es-NI","es-PA","es-PE","es-PR","es-PY","es-SV","es-US","es-UY","es-VE","et-EE","eu-ES","fa-IR","fi-FI","fil-PH","fr-BE","fr-CA","fr-CH","fr-FR","gl-ES","gu-IN","hi-IN","hr-HR","hu-HU","hy-AM","id-ID","is-IS","it-CH","it-IT","iw-IL","ja-JP","jv-ID","ka-GE","kk-KZ","km-KH","kn-IN","ko-KR","lo-LA","lt-LT","lv-LV","mk-MK","ml-IN","mn-MN","mr-IN","ms-MY","my-MM","ne-NP","nl-BE","nl-NL","no-NO","pa-Guru-IN","pl-PL","pt-BR","pt-PT","ro-RO","ru-RU","si-LK","sk-SK","sl-SI","sq-AL","sr-RS","su-ID","sv-SE","sw-KE","sw-TZ","ta-IN","ta-LK","ta-MY","ta-SG","te-IN","th-TH","tr-TR","uk-UA","ur-IN","ur-PK","uz-UZ","vi-VN","yue-Hant-HK","zh (cmn-Hans-CN)","zh-TW (cmn-Hant-TW)","zu-ZA"],
                        value="fa-IR", label="Language code"),
        ]
        output_transcribe1 = gr.Textbox(label="Output", show_copy_button=True, rtl=True)
        transcribe_audio1_go = gr.Button("Submit")
    # transcribe_audio1_fa.click(fn=predict_fa,
    #                            inputs=[inputs_speech_fa, inputs_model_fa],
    #                            outputs=[output_transcribe1_fa, output_transcribe1_fa1, output_transcribe1_fa2])
    transcribe_audio1_go.click(fn=g_rec,
                               inputs=inputs_speech1,
                               outputs=output_transcribe1)
if __name__ == "__main__":
    demo.launch()