erastorgueva-nv committed on
Commit b43c4a1 · 1 Parent(s): 9826ad7

add functionality to generate and display timestamps if transcribing

Files changed (1):
  1. app.py +135 -51
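
For context on the change: when timestamp generation is requested, the new code reads word- and segment-level entries (each with start/end times in seconds) from the model output and renders them as HTML. Below is a minimal, self-contained sketch of that rendering step, using invented sample entries shaped like the ones the updated transcribe() reads from output[0].timestamp["word"] in the diff:

# Illustrative sketch only: mock entries shaped like output[0].timestamp["word"];
# the words and times below are invented.
word_level_timestamps = [
    {"word": "hello", "start": 0.12, "end": 0.48},
    {"word": "world", "start": 0.52, "end": 0.90},
]

output_html = "<div class='transcript'>\n"
for entry in word_level_timestamps:
    output_html += f'<span>{entry["word"]} <span class="timestamp">({entry["start"]:.2f}-{entry["end"]:.2f})</span></span>\n'
output_html += "</div>\n"
print(output_html)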
app.py CHANGED
@@ -14,7 +14,7 @@ from nemo.collections.asr.parts.utils.streaming_utils import FrameBatchMultiTaskAED
from nemo.collections.asr.parts.utils.transcribe_utils import get_buffered_pred_feat_multitaskAED

SAMPLE_RATE = 16000 # Hz
- MAX_AUDIO_MINUTES = 10 # wont try to transcribe if longer than this
+ MAX_AUDIO_MINUTES = 30 # wont try to transcribe if longer than this

model = ASRModel.from_pretrained("nvidia/canary-1b-flash")
model.eval()
@@ -32,7 +32,14 @@ model.cfg.preprocessor.pad_to = 0
feature_stride = model.cfg.preprocessor['window_stride']
model_stride_in_secs = feature_stride * 8 # 8 = model stride, which is 8 for FastConformer

- frame_asr = FrameBatchMultiTaskAED(
+ frame_asr_10s = FrameBatchMultiTaskAED(
+     asr_model=model,
+     frame_len=10.0,
+     total_buffer=10.0,
+     batch_size=16,
+ )
+
+ frame_asr_40s = FrameBatchMultiTaskAED(
    asr_model=model,
    frame_len=40.0,
    total_buffer=40.0,
@@ -69,9 +76,8 @@ def convert_audio(audio_filepath, tmpdir, utt_id):

    return out_filename, duration

-
@spaces.GPU
- def transcribe(audio_filepath, src_lang, tgt_lang, pnc):
+ def transcribe(audio_filepath, src_lang, tgt_lang, pnc, gen_ts):

    if audio_filepath is None:
        raise gr.Error("Please provide some input audio: either upload an audio file or use the microphone")
@@ -104,8 +110,9 @@ def transcribe(audio_filepath, src_lang, tgt_lang, pnc):
        else:
            taskname = "s2t_translation"

-         # update pnc variable to be "yes" or "no"
+         # update pnc and gen_ts variables to be "yes" or "no"
        pnc = "yes" if pnc else "no"
+         gen_ts = "yes" if gen_ts else "no"

        # make manifest file and save
        manifest_data = {
@@ -116,6 +123,7 @@ def transcribe(audio_filepath, src_lang, tgt_lang, pnc):
            "pnc": pnc,
            "answer": "predict",
            "duration": str(duration),
+             "timestamp": gen_ts,
        }

        manifest_filepath = os.path.join(tmpdir, f'{utt_id}.json')
@@ -124,34 +132,95 @@ def transcribe(audio_filepath, src_lang, tgt_lang, pnc):
            line = json.dumps(manifest_data)
            fout.write(line + '\n')

-         # call transcribe, passing in manifest filepath
-         if duration < 40:
-             output_text = model.transcribe(manifest_filepath)[0].text
-         else: # do buffered inference
-             with torch.cuda.amp.autocast(dtype=amp_dtype): # TODO: make it work if no cuda
-                 with torch.no_grad():
-                     hyps = get_buffered_pred_feat_multitaskAED(
-                         frame_asr,
-                         model.cfg.preprocessor,
-                         model_stride_in_secs,
-                         model.device,
-                         manifest=manifest_filepath,
-                         filepaths=None,
-                     )
-
-             output_text = hyps[0].text
-
-     return output_text.strip()
+
+         # setup beginning of output html
+         output_html = '''
+         <!DOCTYPE html>
+         <html lang="en">
+         <head>
+         <style>
+
+             .transcript {
+                 font-family: Arial, sans-serif;
+                 line-height: 1.6;
+             }
+             .timestamp {
+                 color: gray;
+                 font-size: 0.8em;
+                 margin-right: 5px;
+             }
+         </style>
+         </head>
+         <body>
+         '''
+
+
+         if gen_ts == "yes": # if will generate timestamps
+
+             if duration < 10:
+                 output = model.transcribe(manifest_filepath)
+             else:
+                 output = get_buffered_pred_feat_multitaskAED(
+                     frame_asr_10s,
+                     model.cfg.preprocessor,
+                     model_stride_in_secs,
+                     model.device,
+                     manifest=manifest_filepath,
+                     filepaths=None,
+                 )
+
+             # process output to get word and segment level timestamps
+             word_level_timestamps = output[0].timestamp["word"]
+
+             output_html += "<p><b>Transcript with word-level timestamps (in seconds)</b></p>\n"
+             output_html += "<div class='transcript'>\n"
+             for entry in word_level_timestamps:
+                 output_html += f'<span>{entry["word"]} <span class="timestamp">({entry["start"]:.2f}-{entry["end"]:.2f})</span></span>\n'
+             output_html += "</div>\n"
+
+             segment_level_timestamps = output[0].timestamp["segment"]
+             output_html += "<p><b>Transcript with segment-level timestamps (in seconds)</b></p>\n"
+             output_html += "<div class='transcript'>\n"
+             for entry in segment_level_timestamps:
+                 output_html += f'<span>{entry["segment"]} <span class="timestamp">({entry["start"]:.2f}-{entry["end"]:.2f})</span></span>\n'
+             output_html += "</div>\n"
+
+         else: # if will not generate timestamps
+
+             if duration < 40:
+                 output = model.transcribe(manifest_filepath)
+
+             else: # do buffered inference
+                 output = get_buffered_pred_feat_multitaskAED(
+                     frame_asr_40s,
+                     model.cfg.preprocessor,
+                     model_stride_in_secs,
+                     model.device,
+                     manifest=manifest_filepath,
+                     filepaths=None,
+                 )
+
+             output_html += "<p><b>Transcript</b></p>\n"
+             output_text = output[0].text
+             output_html += f'<div class="transcript">{output_text}</div>\n'
+
+         output_html += '''
+         </div>
+         </body>
+         </html>
+         '''
+
+     return output_html

# add logic to make sure dropdown menus only suggest valid combos
- def on_src_or_tgt_lang_change(src_lang_value, tgt_lang_value, pnc_value):
+ def on_src_or_tgt_lang_change(src_lang_value, tgt_lang_value, pnc_value, gen_ts_value):
    """Callback function for when src_lang or tgt_lang dropdown menus are changed.

    Args:
-         src_lang_value(string), tgt_lang_value (string), pnc_value(bool) - the current
+         src_lang_value(string), tgt_lang_value (string), pnc_value(bool), gen_ts_value(bool) - the current
        chosen "values" of each Gradio component
    Returns:
-         src_lang, tgt_lang, pnc - these are the new Gradio components that will be displayed
+         src_lang, tgt_lang, pnc, gen_ts - these are the new Gradio components that will be displayed

    Note: I found the required logic is easier to understand if you think about the possible src & tgt langs as
    a matrix, e.g. with English, Spanish, French, German as the langs, and only transcription in the same language,
@@ -225,30 +294,38 @@ def on_src_or_tgt_lang_change(src_lang_value, tgt_lang_value, pnc_value):
        value=tgt_lang_value,
        label="Transcribe in language:"
    )
-     # let pnc be anything if src_lang_value == tgt_lang_value, else fix to True
+     # if src_lang_value == tgt_lang_value then pnc and gen_ts can be anything
+     # else, fix pnc to True and gen_ts to False
    if src_lang_value == tgt_lang_value:
        pnc = gr.Checkbox(
            value=pnc_value,
-             label="Punctuation & Capitalization in transcript?",
+             label="Punctuation & Capitalization in model output?",
+             interactive=True
+         )
+         gen_ts = gr.Checkbox(
+             value=gen_ts_value,
+             label="Generate timestamps?",
            interactive=True
        )
    else:
        pnc = gr.Checkbox(
            value=True,
-             label="Punctuation & Capitalization in transcript?",
+             label="Punctuation & Capitalization in model output?",
+             interactive=False
+         )
+         gen_ts = gr.Checkbox(
+             value=False,
+             label="Generate timestamps?",
            interactive=False
        )
-     return src_lang, tgt_lang, pnc
+
+     return src_lang, tgt_lang, pnc, gen_ts


with gr.Blocks(
    title="NeMo Canary 1B Flash Model",
    css="""
    textarea { font-size: 18px;}
-     #model_output_text_box span {
-         font-size: 18px;
-         font-weight: bold;
-     }
    """,
    theme=gr.themes.Default(text_size=gr.themes.sizes.text_lg) # make text slightly bigger (default is text_md )
) as demo:
@@ -260,22 +337,25 @@ with gr.Blocks(
    gr.HTML(
        "<p><b>Step 1:</b> Upload an audio file or record with your microphone.</p>"

-         "<p style='color: #A0A0A0;'>This demo supports audio files up to 10 mins long. "
+         f"<p style='color: #A0A0A0;'>This demo supports audio files up to {MAX_AUDIO_MINUTES} mins long. "
        "You can transcribe longer files locally with this NeMo "
        "<a href='https://github.com/NVIDIA/NeMo/blob/main/examples/asr/asr_chunked_inference/aed/speech_to_text_aed_chunked_infer.py'>script</a>.</p>"
    )
-
    audio_file = gr.Audio(sources=["microphone", "upload"], type="filepath")

-     gr.HTML("<p><b>Step 2:</b> Choose the input and output language.</p>")
-
-     src_lang = gr.Dropdown(
-         choices=["English", "Spanish", "French", "German"],
-         value="English",
-         label="Input audio is spoken in:"
+     gr.HTML(
+         "<p><b>Step 2:</b> Choose the input and output language.</p>"
+         "<p style='color: #A0A0A0;'>If input & output languages are the same, you can also toggle generating punctuation & capitalization and timestamps.</p>"
    )

+
    with gr.Column():
+         src_lang = gr.Dropdown(
+             choices=["English", "Spanish", "French", "German"],
+             value="English",
+             label="Input audio is spoken in:"
+         )
+
        tgt_lang = gr.Dropdown(
            choices=["English", "Spanish", "French", "German"],
            value="English",
@@ -283,7 +363,11 @@ with gr.Blocks(
        )
        pnc = gr.Checkbox(
            value=True,
-             label="Punctuation & Capitalization in transcript?",
+             label="Punctuation & Capitalization in model output?",
+         )
+         gen_ts = gr.Checkbox(
+             value=True,
+             label="Generate timestamps?",
        )

    with gr.Column():
@@ -295,11 +379,11 @@ with gr.Blocks(
            variant="primary", # make "primary" so it stands out (default is "secondary")
        )

-         model_output_text_box = gr.Textbox(
+         model_output_html = gr.HTML(
            label="Model Output",
-             elem_id="model_output_text_box",
        )

+
    with gr.Row():

        gr.HTML(
@@ -311,20 +395,20 @@

    go_button.click(
        fn=transcribe,
-         inputs = [audio_file, src_lang, tgt_lang, pnc],
-         outputs = [model_output_text_box]
+         inputs = [audio_file, src_lang, tgt_lang, pnc, gen_ts],
+         outputs = [model_output_html]
    )

    # call on_src_or_tgt_lang_change whenever src_lang or tgt_lang dropdown menus are changed
    src_lang.change(
        fn=on_src_or_tgt_lang_change,
-         inputs=[src_lang, tgt_lang, pnc],
-         outputs=[src_lang, tgt_lang, pnc],
+         inputs=[src_lang, tgt_lang, pnc, gen_ts],
+         outputs=[src_lang, tgt_lang, pnc, gen_ts],
    )
    tgt_lang.change(
        fn=on_src_or_tgt_lang_change,
-         inputs=[src_lang, tgt_lang, pnc],
-         outputs=[src_lang, tgt_lang, pnc],
+         inputs=[src_lang, tgt_lang, pnc, gen_ts],
+         outputs=[src_lang, tgt_lang, pnc, gen_ts],
    )

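
One way to read the new inference routing in transcribe(): with timestamps enabled, audio shorter than 10 seconds goes through model.transcribe() directly and anything longer through the 10-second buffered decoder (frame_asr_10s); with timestamps disabled, the cutoff is 40 seconds and frame_asr_40s handles the long case. A compact sketch of that decision follows (simplified from the diff; pick_decoder is a hypothetical helper, not part of app.py):

# Hypothetical helper summarizing the routing added in this commit.
def pick_decoder(duration_s: float, gen_ts: bool) -> str:
    cutoff = 10 if gen_ts else 40
    if duration_s < cutoff:
        return "model.transcribe"  # single-pass decoding
    return "frame_asr_10s" if gen_ts else "frame_asr_40s"  # buffered decoding

print(pick_decoder(12.0, gen_ts=True))   # -> "frame_asr_10s"
print(pick_decoder(12.0, gen_ts=False))  # -> "model.transcribe"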