erastorgueva-nv committed
Commit f71078c · 1 Parent(s): b43c4a1

styling, move frame_asr init to transcribe function


Signed-off-by: Elena Rastorgueva <[email protected]>
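
Spelling out the second half of the summary line: the two module-level FrameBatchMultiTaskAED instances (frame_asr_10s, frame_asr_40s) are removed, and transcribe() now constructs one per call with whichever frame_len/total_buffer the branch needs. A minimal sketch of the resulting pattern, not the full app.py: the helper name run_buffered is hypothetical, the manifest= keyword is assumed (the hunk context below cuts off before that argument), and the import paths are the usual NeMo locations for these utilities.

    from nemo.collections.asr.parts.utils.streaming_utils import FrameBatchMultiTaskAED
    from nemo.collections.asr.parts.utils.transcribe_utils import get_buffered_pred_feat_multitaskAED

    def run_buffered(model, manifest_filepath, model_stride_in_secs, frame_len):
        # A fresh helper per request, so its audio/feature buffers cannot
        # carry over into the next call the way a shared module-level
        # instance otherwise might.
        frame_asr = FrameBatchMultiTaskAED(
            asr_model=model,
            frame_len=frame_len,    # 10.0 in the timestamp branch, 40.0 in the plain-transcript branch
            total_buffer=frame_len,
            batch_size=16,
        )
        return get_buffered_pred_feat_multitaskAED(
            frame_asr,
            model.cfg.preprocessor,
            model_stride_in_secs,
            model.device,
            manifest=manifest_filepath,  # assumed keyword; not visible in the hunks below
            filepaths=None,
        )

The likely motivation, beyond what the summary states, is request isolation: each Gradio call gets its own buffers, at the cost of re-instantiating the helper per request.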

Files changed (1)
1. app.py +29 -20
app.py CHANGED
@@ -32,20 +32,6 @@ model.cfg.preprocessor.pad_to = 0
 feature_stride = model.cfg.preprocessor['window_stride']
 model_stride_in_secs = feature_stride * 8  # 8 = model stride, which is 8 for FastConformer
 
-frame_asr_10s = FrameBatchMultiTaskAED(
-    asr_model=model,
-    frame_len=10.0,
-    total_buffer=10.0,
-    batch_size=16,
-)
-
-frame_asr_40s = FrameBatchMultiTaskAED(
-    asr_model=model,
-    frame_len=40.0,
-    total_buffer=40.0,
-    batch_size=16,
-)
-
 amp_dtype = torch.float16
 
 def convert_audio(audio_filepath, tmpdir, utt_id):
@@ -139,16 +125,23 @@ def transcribe(audio_filepath, src_lang, tgt_lang, pnc, gen_ts):
 <html lang="en">
 <head>
 <style>
-
 .transcript {
     font-family: Arial, sans-serif;
     line-height: 1.6;
+    margin: 20px 0;
 }
 .timestamp {
     color: gray;
     font-size: 0.8em;
     margin-right: 5px;
 }
+.heading {
+    color: #2c3e50;
+    font-family: Arial, sans-serif;
+    font-weight: bold;
+    margin: 15px 0 8px 0;
+    border-bottom: 1px solid #eee;
+}
 </style>
 </head>
 <body>
@@ -160,8 +153,15 @@ def transcribe(audio_filepath, src_lang, tgt_lang, pnc, gen_ts):
         if duration < 10:
             output = model.transcribe(manifest_filepath)
         else:
+            frame_asr = FrameBatchMultiTaskAED(
+                asr_model=model,
+                frame_len=10.0,
+                total_buffer=10.0,
+                batch_size=16,
+            )
+
             output = get_buffered_pred_feat_multitaskAED(
-                frame_asr_10s,
+                frame_asr,
                 model.cfg.preprocessor,
                 model_stride_in_secs,
                 model.device,
@@ -172,14 +172,14 @@ def transcribe(audio_filepath, src_lang, tgt_lang, pnc, gen_ts):
         # process output to get word and segment level timestamps
         word_level_timestamps = output[0].timestamp["word"]
 
-        output_html += "<p><b>Transcript with word-level timestamps (in seconds)</b></p>\n"
+        output_html += "<div class='heading'>Transcript with word-level timestamps (in seconds)</div>\n"
         output_html += "<div class='transcript'>\n"
         for entry in word_level_timestamps:
             output_html += f'<span>{entry["word"]} <span class="timestamp">({entry["start"]:.2f}-{entry["end"]:.2f})</span></span>\n'
         output_html += "</div>\n"
 
         segment_level_timestamps = output[0].timestamp["segment"]
-        output_html += "<p><b>Transcript with segment-level timestamps (in seconds)</b></p>\n"
+        output_html += "<div class='heading'>Transcript with segment-level timestamps (in seconds)</div>\n"
         output_html += "<div class='transcript'>\n"
         for entry in segment_level_timestamps:
             output_html += f'<span>{entry["segment"]} <span class="timestamp">({entry["start"]:.2f}-{entry["end"]:.2f})</span></span>\n'
@@ -191,8 +191,14 @@ def transcribe(audio_filepath, src_lang, tgt_lang, pnc, gen_ts):
             output = model.transcribe(manifest_filepath)
 
         else:  # do buffered inference
+            frame_asr = FrameBatchMultiTaskAED(
+                asr_model=model,
+                frame_len=40.0,
+                total_buffer=40.0,
+                batch_size=16,
+            )
             output = get_buffered_pred_feat_multitaskAED(
-                frame_asr_40s,
+                frame_asr,
                 model.cfg.preprocessor,
                 model_stride_in_secs,
                 model.device,
@@ -200,7 +206,10 @@ def transcribe(audio_filepath, src_lang, tgt_lang, pnc, gen_ts):
                 filepaths=None,
             )
 
-            output_html += "<p><b>Transcript</b></p>\n"
+            if taskname == "asr":
+                output_html += "<div class='heading'>Transcript</div>\n"
+            else:
+                output_html += "<div class='heading'>Translated Text</div>\n"
             output_text = output[0].text
             output_html += f'<div class="transcript">{output_text}</div>\n'
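
As a self-contained check of the styling half of the change, the snippet below copies the word-level loop from the diff verbatim; the sample timestamps are invented, and the printed fragment is the markup that the new .heading and .transcript CSS rules target.

    # Invented sample data in the shape the diff iterates over.
    word_level_timestamps = [
        {"word": "hello", "start": 0.00, "end": 0.40},
        {"word": "world", "start": 0.42, "end": 0.85},
    ]

    output_html = "<div class='heading'>Transcript with word-level timestamps (in seconds)</div>\n"
    output_html += "<div class='transcript'>\n"
    for entry in word_level_timestamps:
        output_html += f'<span>{entry["word"]} <span class="timestamp">({entry["start"]:.2f}-{entry["end"]:.2f})</span></span>\n'
    output_html += "</div>\n"

    print(output_html)
    # <div class='heading'>Transcript with word-level timestamps (in seconds)</div>
    # <div class='transcript'>
    # <span>hello <span class="timestamp">(0.00-0.40)</span></span>
    # <span>world <span class="timestamp">(0.42-0.85)</span></span>
    # </div>

Swapping the <p><b>...</b></p> headings for <div class='heading'> lets the stylesheet control the heading font, spacing, and bottom border in one place instead of relying on default paragraph styling.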