import os
import json

import gradio as gr
from transformers import pipeline
from pydub import AudioSegment
import speech_recognition as sr
html_seeker = '''<style>
html, body {
    margin: 0;
    padding: 0;
    min-width: 900px;
}
#header {
    /*position: fixed;*/
    top: 0;
    left: 0;
    height: 50px;
    min-width: 900px;
    line-height: 50px;
    width: 100%;
    background-color: #999;
    box-shadow: 0px 0px 5px 0px rgba(0,0,0,0.5);
    font-family: Helvetica, sans-serif;
}
#header, #header a {
    color: white;
}
.home {
    margin: 0;
    font-weight: bold;
    text-transform: lowercase;
    width: 100px;
}
h4.home {
    margin: 0;
    background: #666;
    padding-left: 25px;
    padding-right: 30px;
    margin-right: 20px;
    float: left;
    text-decoration: none;
}
.home:hover a {
    background: #555;
}
#audio {
    margin-left: 10px;
    width: 500px;
    display: inline-block;
}
#transcript {
    margin: 0 15px;
    margin-bottom: 5em;
    white-space: pre-wrap;
    line-height: 2em;
    max-width: 600px;
    color: #999;
    clear: both;
    margin-top: 75px;
    /*direction: rtl;*/
}
.success {
    color: black;
}
.success:hover {
    text-decoration: underline;
}
.active {
    color: magenta;
    background-color: yellow;
}
#preloader {
    visibility: hidden;
}
</style><div id="header">
    <audio id="audio" src="17.mp3" controls></audio>
</div>
<div id="transcript" dir="auto"></div>
<img src="data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7" onload="
    var oldScript = document.querySelector('script#huihiuh6');
    var newScript = document.createElement('script');
    Array.from(oldScript.attributes)
        .forEach( attr => newScript.setAttribute(attr.name, attr.value) );
    newScript.appendChild(document.createTextNode(oldScript.innerHTML));
    oldScript.parentNode.replaceChild(newScript, oldScript);
">
<script id="huihiuh6">
var $a = document.getElementById("audio");
// Reuse the src of the first <audio> element on the page.
$a.src = document.querySelector('audio').src;
console.log($a);
window.onkeydown = function(ev) {
    // Space toggles playback instead of scrolling the page.
    if (ev.code === "Space") {
        ev.preventDefault();
        if ($a.paused) { $a.play(); } else { $a.pause(); }
    }
}
var $trans = document.getElementById("transcript");
var wds = [];
var cur_wd;
function highlight_word() {
    var t = $a.currentTime;
    // XXX: O(N); use binary search
    var hits = wds.filter(function(x) {
        return (t - x['timestamp']['0']) > 0.01 && (x['timestamp']['1'] - t) > 0.01;
    });
    var next_wd = hits[hits.length - 1];
    if (cur_wd != next_wd) {
        var active = document.querySelectorAll('.active');
        for (var i = 0; i < active.length; i++) {
            active[i].classList.remove('active');
        }
        if (next_wd && next_wd.$div) {
            next_wd.$div.classList.add('active');
            //render_phones(next_wd);
        }
    }
    cur_wd = next_wd;
    //highlight_phone(t);
    window.requestAnimationFrame(highlight_word);
}
window.requestAnimationFrame(highlight_word);
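// A possible binary-search replacement for the linear scan flagged above
// (sketch only, not wired in; assumes wds is sorted by start time and
// words do not overlap):
function find_word_at(t) {
    var lo = 0, hi = wds.length - 1, best = null;
    while (lo <= hi) {
        var mid = (lo + hi) >> 1;
        if ((t - wds[mid]['timestamp']['0']) > 0.01) {
            best = wds[mid];  // starts early enough; look for a later match
            lo = mid + 1;
        } else {
            hi = mid - 1;
        }
    }
    // Discard the candidate if playback is already past its end time.
    if (best && (best['timestamp']['1'] - t) <= 0.01) { return null; }
    return best;
}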
$trans.innerHTML = "Loading...";
function render(ret) {
    wds = ret['chunks'] || [];
    var transcript = ret['text'];  // full text; kept for reference, unused below
    $trans.innerHTML = '';
    wds.forEach(function(wd) {
        var $wd = document.createElement('span');
        var txt = wd['text'];
        var $wdText = document.createTextNode(txt);
        $wd.appendChild($wdText);
        wd.$div = $wd;
        $wd.className = 'success';
        // Clicking a word seeks the player to that word's start time.
        $wd.onclick = function() {
            console.log(wd['timestamp']['0']);
            $a.currentTime = wd['timestamp']['0'];
            $a.play();
        };
        $trans.appendChild($wd);
        $trans.appendChild(document.createTextNode(' '));
    });
}
function update() {
    if (INLINE_JSON) {
        // We want this to work from file:/// domains, so we provide a
        // mechanism for inlining the alignment data.
        render(INLINE_JSON);
    }
}
var INLINE_JSON='''
# html_seeker deliberately ends mid-statement: predict_fa() splices the ASR
# JSON between the two fragments, so the rendered page reads
# `var INLINE_JSON=<json>; update(); </script>`.
html_seeker2=''';
update();
</script>'''
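
# Hypothetical debugging helper (not used by the app): splicing the ASR JSON
# between the two fragments yields a complete, self-contained page that can be
# saved and opened from file:/// for offline inspection.
def build_player_page(asr_result, path="player_debug.html"):
    page = html_seeker + json.dumps(asr_result) + html_seeker2
    with open(path, "w", encoding="utf-8") as f:
        f.write(page)
    return page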
# NOTE: these pipelines are commented out, so predict_fa below will raise a
# NameError if the (also commented) Persian tab is re-enabled without them.
# model_name = "voidful/wav2vec2-xlsr-multilingual-56"
# model0 = pipeline(task="automatic-speech-recognition",
#                   model=model_name)
# model_name = "SLPL/Sharif-wav2vec2"
# model2 = pipeline(task="automatic-speech-recognition",
#                   model=model_name)
# model_name = "ghofrani/common8"
# model1 = pipeline(task="automatic-speech-recognition",
#                   model=model_name)
def predict_fa(speech, model):
    """Run the selected ASR model with word-level timestamps."""
    if model == "SLPL/Sharif-wav2vec2":
        text = model2(speech, return_timestamps="word")
    elif model == "ghofrani/common8":
        text = model1(speech, return_timestamps="word")
    elif model == "voidful/wav2vec2-xlsr-multilingual-56":
        text = model0(speech, return_timestamps="word")
    return [text['text'], json.dumps(text), html_seeker + json.dumps(text) + html_seeker2]
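
# For reference, the transformers ASR pipeline with return_timestamps="word"
# returns a dict shaped roughly like this (illustrative values):
# {'text': 'hello world',
#  'chunks': [{'text': 'hello', 'timestamp': (0.0, 0.4)},
#             {'text': 'world', 'timestamp': (0.5, 0.9)}]}
# The embedded JavaScript above reads chunks[i]['timestamp'][0] and [1].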
def convert_to_wav(filename):
    """Convert any pydub-readable file to WAV, avoiding name collisions."""
    base, ext = os.path.splitext(filename)
    audio = AudioSegment.from_file(filename, format=ext.replace(".", ""))
    new_filename = base + ".wav"
    # Append a counter instead of stacking "(1)(1)..." suffixes.
    counter = 1
    while os.path.exists(new_filename):
        new_filename = f"{base}({counter}).wav"
        counter += 1
    print(f"Converting {filename} to {new_filename}...")
    audio.export(new_filename, format="wav")
    return new_filename
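
# Example (hypothetical path): convert_to_wav("talk.mp3") -> "talk.wav",
# or "talk(1).wav" if that name already exists.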
def g_rec(audio_File, language):
    """Transcribe a file with the Google Web Speech API via speech_recognition."""
    r = sr.Recognizer()
    print(audio_File)
    # sr.AudioFile only decodes WAV/AIFF/FLAC; uncomment to convert other
    # formats first.
    #if not os.path.splitext(audio_File)[1] == ".wav":
    #    audio_File = convert_to_wav(audio_File)
    with sr.AudioFile(audio_File) as source:
        audio = r.record(source)
    try:
        res = r.recognize_google(audio, language=language)
    except Exception as e:
        res = "Exception: " + str(e)
    return res
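
# Quick local check (hypothetical file; recognize_google needs network access):
# print(g_rec("sample.wav", "fa-IR"))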
with gr.Blocks() as demo:
    gr.Markdown("Multilingual Speech Recognition")
    # with gr.Tab("Persian models"):
    #     inputs_speech_fa = gr.Audio(sources=["upload"], type="filepath", label="Upload your audio:")
    #     inputs_model_fa = gr.Radio(label="Model", choices=["ghofrani/common8", "SLPL/Sharif-wav2vec2", "voidful/wav2vec2-xlsr-multilingual-56"])
    #     output_transcribe1_fa = gr.Textbox(label="Transcribed text:")
    #     output_transcribe1_fa1 = gr.Textbox(label="Transcribed text with timestamps:")
    #     output_transcribe1_fa2 = gr.HTML(label="")
    #     transcribe_audio1_fa = gr.Button("Submit")
    with gr.Tab("google"):
        gr.Markdown("Set your speech language")
        inputs_speech1 = [
            gr.Audio(sources=["upload"], type="filepath"),
            gr.Dropdown(choices=["af-ZA","am-ET","ar-AE","ar-BH","ar-DZ","ar-EG","ar-IL","ar-IQ","ar-JO","ar-KW","ar-LB","ar-MA","ar-MR","ar-OM","ar-PS","ar-QA","ar-SA","ar-TN","ar-YE","az-AZ","bg-BG","bn-BD","bn-IN","bs-BA","ca-ES","cs-CZ","da-DK","de-AT","de-CH","de-DE","el-GR","en-AU","en-CA","en-GB","en-GH","en-HK","en-IE","en-IN","en-KE","en-NG","en-NZ","en-PH","en-PK","en-SG","en-TZ","en-US","en-ZA","es-AR","es-BO","es-CL","es-CO","es-CR","es-DO","es-EC","es-ES","es-GT","es-HN","es-MX","es-NI","es-PA","es-PE","es-PR","es-PY","es-SV","es-US","es-UY","es-VE","et-EE","eu-ES","fa-IR","fi-FI","fil-PH","fr-BE","fr-CA","fr-CH","fr-FR","gl-ES","gu-IN","hi-IN","hr-HR","hu-HU","hy-AM","id-ID","is-IS","it-CH","it-IT","iw-IL","ja-JP","jv-ID","ka-GE","kk-KZ","km-KH","kn-IN","ko-KR","lo-LA","lt-LT","lv-LV","mk-MK","ml-IN","mn-MN","mr-IN","ms-MY","my-MM","ne-NP","nl-BE","nl-NL","no-NO","pa-Guru-IN","pl-PL","pt-BR","pt-PT","ro-RO","ru-RU","si-LK","sk-SK","sl-SI","sq-AL","sr-RS","su-ID","sv-SE","sw-KE","sw-TZ","ta-IN","ta-LK","ta-MY","ta-SG","te-IN","th-TH","tr-TR","uk-UA","ur-IN","ur-PK","uz-UZ","vi-VN","yue-Hant-HK","zh (cmn-Hans-CN)","zh-TW (cmn-Hant-TW)","zu-ZA"],
                        value="fa-IR", label="Language code"),
        ]
        output_transcribe1 = gr.Textbox(label="Output", show_copy_button=True, rtl=True)
        transcribe_audio1_go = gr.Button("Submit")
    # transcribe_audio1_fa.click(fn=predict_fa,
    #                            inputs=[inputs_speech_fa, inputs_model_fa],
    #                            outputs=[output_transcribe1_fa, output_transcribe1_fa1, output_transcribe1_fa2])
    transcribe_audio1_go.click(fn=g_rec,
                               inputs=inputs_speech1,
                               outputs=output_transcribe1)
if __name__ == "__main__":
    demo.launch()