AKIRA committed
Commit ad71343 · 1 Parent(s): b20b423

feat: Optimize ASR speed with ONNX and audio compression

Files changed (3):
  1. app.py +152 -101
  2. pages/index/index.js +81 -167
  3. requirements.txt +7 -0
app.py CHANGED
@@ -1,125 +1,176 @@
  import gradio as gr
- from transformers import pipeline
  import torch

- # 1. Load Models (this will happen only once when the app starts)
- print("Loading models...")

- # ASR Pipeline
- asr_pipeline = pipeline(
-     "automatic-speech-recognition",
-     model="openai/whisper-small",
-     torch_dtype=torch.float16,  # Use float16 for faster inference
-     device="cpu"  # Specify CPU device
- )

- # Translation Pipelines
  translators = {
      "en-zh": pipeline("translation", model="Helsinki-NLP/opus-mt-en-zh"),
-     "zh-en": pipeline("translation", model="Helsinki-NLP/opus-mt-zh-en"),
-     "en-ja": pipeline("translation", model="Helsinki-NLP/opus-mt-en-jap"),  # Corrected model name
      "ja-en": pipeline("translation", model="Helsinki-NLP/opus-mt-ja-en"),
-     "en-ko": pipeline("translation", model="Helsinki-NLP/opus-mt-tc-big-en-ko"),  # Corrected model name
      "ko-en": pipeline("translation", model="Helsinki-NLP/opus-mt-ko-en"),
  }

- print("Models loaded successfully.")
-
- # 2. Define Processing Functions
-
- def transcribe_audio(audio_file):
-     print(f"Received audio_file (binary): {audio_file}")
-     if audio_file is None:
-         print("Audio file is None.")
-         return ""
      try:
-         import tempfile
-         import soundfile as sf
-
-         # Gradio passes binary data as a file-like object or a path to a temp file.
-         # If it's a path, use it directly. If it's binary data, write it to a temp file.
-         if isinstance(audio_file, str) and os.path.exists(audio_file):
-             audio_path = audio_file
-         else:
-             # Assume it's binary data (bytes or similar)
-             with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
-                 tmp_file.write(audio_file)
-                 audio_path = tmp_file.name
-             print(f"Wrote binary data to temp file: {audio_path}")
-
-         print(f"Audio file path exists: {os.path.exists(audio_path)}")
-         print(f"Audio file size: {os.path.getsize(audio_path)} bytes")
-
-         # The pipeline expects a file path
-         text = asr_pipeline(audio_path)["text"]
-         print(f"ASR result: {text}")
-
-         # Clean up temporary file
-         if not isinstance(audio_file, str) or not os.path.exists(audio_file):
-             os.remove(audio_path)
-
-         return text
      except Exception as e:
-         print(f"Error in ASR: {e}")
-         import traceback
-         traceback.print_exc()
-         return f"Error in ASR: {e}"

  def translate_text(text, source_lang, target_lang):
-     print(f"Translating '{text}' from {source_lang} to {target_lang}")
-     if not text:
-         return ""
-
-     # Direct translation if possible
-     if f"{source_lang}-{target_lang}" in translators:
-         translator = translators[f"{source_lang}-{target_lang}"]
-         translated_text = translator(text)[0]['translation_text']
-     # Bridge translation via English
-     elif source_lang != 'en' and target_lang != 'en':
-         to_english_translator = translators[f"{source_lang}-en"]
-         english_text = to_english_translator(text)[0]['translation_text']
-
-         from_english_translator = translators[f"en-{target_lang}"]
-         translated_text = from_english_translator(english_text)[0]['translation_text']
-     else:
-         return "Translation route not supported"
-
-     print(f"Translation result: {translated_text}")
-     return translated_text
-
- # 3. Create Gradio Interface
- # Define ASR Interface
- asr_interface = gr.Interface(
-     fn=transcribe_audio,
-     inputs=gr.File(label="Audio Input"),
-     outputs="text",
-     title="ASR API",
-     description="Transcribe audio to text."
- )
-
- # Define Translation Interface
- translate_interface = gr.Interface(
-     fn=translate_text,
-     inputs=[
-         gr.Textbox(label="Input Text"),
-         gr.Dropdown(["en", "zh", "ja", "ko"], label="Source Language"),
-         gr.Dropdown(["en", "zh", "ja", "ko"], label="Target Language")
-     ],
-     outputs="text",
-     title="Translation API",
-     description="Translate text between supported languages."
- )
-
- # Combine interfaces into a Blocks app for multiple API endpoints
- with gr.Blocks() as demo:
-     gr.Markdown("## All-in-One ASR and Translation API")
-
-     with gr.Tab("ASR"):
-         asr_interface.render()
-
-     with gr.Tab("Translate"):
-         translate_interface.render()

- # 4. Launch the App
  if __name__ == "__main__":
-     demo.launch(share=False, server_name="0.0.0.0", server_port=7860)
 
  import gradio as gr
+ from transformers import pipeline, AutoProcessor
+ from optimum.onnxruntime import ORTModelForSpeechSeq2Seq
  import torch
+ import os
+ import base64
+ import tempfile
+ from fastapi import FastAPI, Request
+ from fastapi.responses import JSONResponse
+ import uvicorn
+ import deepl
+ from dotenv import load_dotenv
+ import soundfile as sf
+
+ # --- Load environment variables and initialize DeepL ---
+ load_dotenv()
+
+ DEEPL_AUTH_KEY = os.getenv("DEEPL_AUTH_KEY")
+ deepl_translator = None
+ if DEEPL_AUTH_KEY:
+     try:
+         deepl_translator = deepl.Translator(DEEPL_AUTH_KEY)
+         print("DeepL translator initialized successfully.")
+     except Exception as e:
+         print(f"Error initializing DeepL translator: {e}")
+         print("DeepL will be unavailable.")
+ else:
+     print("DEEPL_AUTH_KEY not found. DeepL will be unavailable.")
+ # --- End ---
+
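Since configuration is read with python-dotenv, running the Space locally only needs a .env file beside app.py. A minimal sketch with a placeholder value (DEEPL_AUTH_KEY is the only variable the app reads):

# .env (placeholder value, not a real key)
DEEPL_AUTH_KEY=your-deepl-api-key-here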
 
+ # 1. Load Models
+ print("Loading all models... This will take some time on startup.")

+ # ASR Model - Using a CPU-optimized ONNX model for speed
+ print("Loading optimized ASR model...")
+ asr_model_id = "openai/whisper-base"

+ # Load the model and processor using Optimum for ONNX Runtime acceleration
+ asr_model = ORTModelForSpeechSeq2Seq.from_pretrained(asr_model_id, provider="CPUExecutionProvider")
+ asr_processor = AutoProcessor.from_pretrained(asr_model_id)
+ print("Optimized ASR model loaded.")
+
+
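Note that openai/whisper-base ships PyTorch weights only, so depending on the installed Optimum version the checkpoint may need an explicit ONNX export. A hedged sketch (export=True is the flag in current Optimum releases; older ones used from_transformers=True):

from optimum.onnxruntime import ORTModelForSpeechSeq2Seq

# Convert the PyTorch checkpoint to ONNX on first load, then cache the
# exported graph so later startups skip the conversion step.
onnx_model = ORTModelForSpeechSeq2Seq.from_pretrained(
    "openai/whisper-base",
    export=True,                      # assumed flag: triggers the ONNX export
    provider="CPUExecutionProvider",  # run inference on CPU via ONNX Runtime
)
onnx_model.save_pretrained("./whisper-base-onnx")  # reload later with from_pretrained("./whisper-base-onnx")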
+ # Translation Pipelines - Reverting to the 6 core, reliable models
  translators = {
      "en-zh": pipeline("translation", model="Helsinki-NLP/opus-mt-en-zh"),
+     "zh-en": pipeline("translation", model="Varine/opus-mt-zh-en-model"),
+     "en-ja": pipeline("translation", model="staka/fugumt-en-ja"),
      "ja-en": pipeline("translation", model="Helsinki-NLP/opus-mt-ja-en"),
+     "en-ko": pipeline("translation", model="Helsinki-NLP/opus-mt-tc-big-en-ko"),
      "ko-en": pipeline("translation", model="Helsinki-NLP/opus-mt-ko-en"),
  }

+ print("All models loaded successfully.")
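Each of these pipelines returns a list with one dict per input, which is why every call site below indexes [0]['translation_text']. A quick sketch of the return shape (output text illustrative, not a real run):

result = translators["en-zh"]("Hello, world")
# result == [{'translation_text': '...'}]
translated = result[0]['translation_text']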
 
+ # 2. Define Core Logic Functions
+ def transcribe_audio(audio_bytes):
      try:
+         # Use a temporary file to handle the audio bytes
+         with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
+             tmp_file.write(audio_bytes)
+             audio_path = tmp_file.name
+
+         # Read the audio file and process it
+         audio_input, sample_rate = sf.read(audio_path)

+         # Ensure the audio is in the correct format (mono, 16 kHz)
+         if audio_input.ndim > 1:
+             audio_input = audio_input.mean(axis=1)  # to mono
+         if sample_rate != 16000:
+             # Placeholder for resampling; for now the frontend is assumed to send 16 kHz audio.
+             pass
+
+         # Process audio and generate token IDs
+         input_features = asr_processor(audio_input, sampling_rate=16000, return_tensors="pt").input_features
+         predicted_ids = asr_model.generate(input_features)

+         # Decode the token IDs to text
+         text = asr_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

+         os.remove(audio_path)
+         return text, None
      except Exception as e:
+         # Clean up the temp file in case of an error
+         if 'audio_path' in locals() and os.path.exists(audio_path):
+             os.remove(audio_path)
+         return None, str(e)
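The resampling branch above is a deliberate no-op because the Mini Program records at 16 kHz. If other clients were ever allowed, one way to fill it in is polyphase resampling; a sketch assuming scipy were added as a dependency (this commit does not add it):

from scipy.signal import resample_poly

# Convert arbitrary-rate input to the 16 kHz Whisper expects.
if sample_rate != 16000:
    audio_input = resample_poly(audio_input, up=16000, down=sample_rate)
    sample_rate = 16000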
 
  def translate_text(text, source_lang, target_lang):
+     if not text or not source_lang or not target_lang:
+         return "", None
+     if source_lang == target_lang:
+         return text, None
+
+     # --- DeepL Hybrid Logic ---
+     if source_lang == 'zh' and target_lang == 'ja' and deepl_translator:
+         print("Attempting translation with DeepL for zh -> ja")
+         try:
+             result = deepl_translator.translate_text(text, target_lang="JA")
+             return result.text, None
+         except Exception as e:
+             print(f"DeepL API call failed: {e}. Falling back to Hugging Face model.")
+     # --- End ---
+
+     key = f"{source_lang}-{target_lang}"
+     try:
+         if key in translators:
+             return translators[key](text)[0]['translation_text'], None

+         elif source_lang != 'en' and target_lang != 'en':
+             if f"{source_lang}-en" not in translators or f"en-{target_lang}" not in translators:
+                 return None, f"Bridge translation route not supported: {source_lang}-en or en-{target_lang}"
+
+             print(f"Performing bridge translation: {source_lang} -> en -> {target_lang}")
+             english_text = translators[f"{source_lang}-en"](text)[0]['translation_text']
+             return translators[f"en-{target_lang}"](english_text)[0]['translation_text'], None
+         else:
+             return None, f"Translation route not supported: {key}"
+     except Exception as e:
+         return None, str(e)
+
+ # 3. Create FastAPI App
124
+ app = FastAPI()
125
+
126
+ # 4. Define API Endpoints with FastAPI
127
+ @app.post("/api/asr")
128
+ async def api_asr(request: Request):
129
+ json_data = await request.json()
130
+ audio_data_uri = json_data.get('audio_data_uri')
131
+ if not audio_data_uri:
132
+ return JSONResponse(status_code=400, content={"error": "No audio_data_uri provided"})
133
+ try:
134
+ header, encoded = audio_data_uri.split(",", 1)
135
+ audio_bytes = base64.b64decode(encoded)
136
+ transcript, error = transcribe_audio(audio_bytes)
137
+ if error:
138
+ return JSONResponse(status_code=500, content={"error": f"ASR Error: {error}"})
139
+ return JSONResponse(status_code=200, content={"transcript": transcript})
140
+ except Exception as e:
141
+ return JSONResponse(status_code=500, content={"error": f"Server error: {e}"})
142
+
143
+ @app.post("/api/translate")
144
+ async def api_translate(request: Request):
145
+ json_data = await request.json()
146
+ text = json_data.get('text')
147
+ source_lang = json_data.get('source_lang')
148
+ target_lang = json_data.get('target_lang')
149
+ if not all([text, source_lang, target_lang]):
150
+ return JSONResponse(status_code=400, content={"error": "Missing parameters"})
151
 
152
+ translated_text, error = translate_text(text, source_lang, target_lang)
153
+ if error:
154
+ return JSONResponse(status_code=500, content={"error": error})
155
+ return JSONResponse(status_code=200, content={"translated_text": translated_text})
156
+
157
+ # 5. Create a simple Gradio UI for debugging (Optional)
158
+ def gradio_asr(audio_file):
159
+ if audio_file is None:
160
+ return ""
161
+ # Gradio provides a file object, read its bytes
162
+ audio_input, sample_rate = sf.read(audio_file.name)
163
+ # Process audio and generate token IDs
164
+ input_features = asr_processor(audio_input, sampling_rate=sample_rate, return_tensors="pt").input_features
165
+ predicted_ids = asr_model.generate(input_features)
166
+ # Decode the token IDs to text
167
+ transcript = asr_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
168
+ return transcript
169
+
170
+ gradio_ui = gr.Interface(fn=gradio_asr, inputs=gr.Audio(type="filepath"), outputs="text", title="ASR Debugger")
171
 
172
+ # 6. Mount Gradio app onto FastAPI
173
+ app = gr.mount_gradio_app(app, gradio_ui, path="/")
174
 
 
175
  if __name__ == "__main__":
176
+ uvicorn.run(app, host="0.0.0.0", port=7860)
pages/index/index.js CHANGED
@@ -1,235 +1,149 @@

  Page({
    data: {
-     // Updated language codes to match Hugging Face API
      languages: {
        'zh': { name: '中文', flag: 'cn' },
        'en': { name: 'English', flag: 'us' },
        'ja': { name: '日本語', flag: 'jp' },
        'ko': { name: '한국어', flag: 'kr' }
      },
-     langCodes: ['zh', 'en', 'ja', 'ko'],
      sourceLang: 'zh',
      targetLang: 'en',
      transcript: '',
      outputText: '',
      isRecording: false,
-     sourceLanguages: [],
-     targetLanguages: [],
-     // Hugging Face Space API URL
-     hfSpaceUrl: 'https://dazaozi-wechat-translator-app.hf.space' // REPLACE WITH YOUR ACTUAL SPACE URL
    },

    onLoad: function () {
-     this.initializeLanguages();
      this.recorderManager = wx.getRecorderManager();
      this.initRecorderManager();
-   },
-
-   initializeLanguages: function () {
-     const { langCodes, languages, sourceLang, targetLang } = this.data;
-     const sourceLanguages = langCodes.map(code => ({
-       langCode: code,
-       name: languages[code].name,
-       flag: languages[code].flag,
-       selected: code === sourceLang
-     }));
-     const targetLanguages = langCodes.map(code => ({
-       langCode: code,
-       name: languages[code].name,
-       flag: languages[code].flag,
-       selected: code === targetLang
-     }));
-     this.setData({ sourceLanguages, targetLanguages });
-   },
-
-   selectSourceLanguage: function (e) {
-     const newSourceLang = e.currentTarget.dataset.langCode;
-     this.setData({ sourceLang: newSourceLang }, () => {
-       this.initializeLanguages();
-       if (this.data.transcript.trim() && this.data.transcript !== '正在聆听...' && this.data.transcript !== '未能识别到语音') {
-         this.translate(this.data.transcript);
-       }
-     });
-   },

-   selectTargetLanguage: function (e) {
-     const newTargetLang = e.currentTarget.dataset.langCode;
-     this.setData({ targetLang: newTargetLang }, () => {
-       this.initializeLanguages();
-       if (this.data.transcript.trim() && this.data.transcript !== '正在聆听...' && this.data.transcript !== '未能识别到语音') {
-         this.translate(this.data.transcript);
-       }
    });
  },

    swapLanguages: function () {
-     let { sourceLang, targetLang, transcript, outputText } = this.data;
-
-     const tempLang = sourceLang;
-     sourceLang = targetLang;
-     targetLang = tempLang;
-
-     const tempText = transcript;
-     transcript = outputText;
-     outputText = tempText;
-
-     this.setData({ sourceLang, targetLang, transcript, outputText }, () => {
-       this.initializeLanguages();
-       if (this.data.transcript.trim() && this.data.transcript !== '正在聆听...' && this.data.transcript !== '未能识别到语音') {
-         this.translate(this.data.transcript);
-       }
-     });
    },

    initRecorderManager: function () {
      this.recorderManager.onStart(() => {
-       console.log('recorder start');
        this.setData({ isRecording: true, transcript: '正在聆听...', outputText: '' });
      });

      this.recorderManager.onStop((res) => {
-       console.log('recorder stop', res);
        this.setData({ isRecording: false });
-       const { tempFilePath } = res;
-       if (tempFilePath) {
-         this.uploadAudioForASR(tempFilePath);
        } else {
-         console.error("onStop event triggered without a tempFilePath.");
-         this.setData({ transcript: '录音文件创建失败' });
        }
      });

      this.recorderManager.onError((res) => {
-       console.error('recorder error', res);
-       this.setData({ isRecording: false, transcript: '语音识别出错' });
      });
    },

-   startRecording: function () {
      wx.getSetting({
        success: (res) => {
          if (!res.authSetting['scope.record']) {
-           wx.authorize({
-             scope: 'scope.record',
-             success: () => {
-               this.recorderManager.start({ duration: 60000, sampleRate: 16000, numberOfChannels: 1, encodeBitRate: 96000, format: 'mp3' });
-             },
-             fail: () => {
-               wx.showToast({ title: '请授权麦克风权限', icon: 'none' });
-             }
-           });
          } else {
-           this.recorderManager.start({ duration: 60000, sampleRate: 16000, numberOfChannels: 1, encodeBitRate: 96000, format: 'mp3' });
          }
-       }
      });
    },

    stopRecording: function () {
-     if (this.recorderManager) {
-       this.recorderManager.stop();
-     }
    },

    uploadAudioForASR: function (filePath) {
-     console.log('Sending audio to Hugging Face ASR backend:', filePath);
      this.setData({ transcript: '正在识别...' });
-
-     const fileSystemManager = wx.getFileSystemManager();
-     fileSystemManager.readFile({
-       filePath: filePath,
-       // Read as ArrayBuffer
-       success: (res) => {
-         wx.request({
-           url: `${this.data.hfSpaceUrl}/api/predict`,
-           method: 'POST',
-           header: {
-             'Content-Type': 'application/octet-stream' // Send as binary stream
-           },
-           data: res.data, // Send ArrayBuffer directly
-           timeout: 60000, // 60 seconds timeout for ASR
-           success: (res) => {
-             try {
-               const data = JSON.parse(res.data);
-               // Gradio API returns result in data[0]
-               if (res.statusCode === 200 && data.data && data.data.length > 0) {
-                 const transcript = data.data[0];
-                 if (transcript) {
-                   this.setData({ transcript: transcript });
-                   this.translate(transcript);
-                 } else {
-                   this.setData({ transcript: '未能识别到语音' });
-                 }
-               } else {
-                 this.setData({ transcript: '语音识别失败' });
-                 console.error('ASR backend response error:', res);
-               }
-             } catch (e) {
-               console.error('Failed to parse ASR response JSON:', e, res.data);
-               this.setData({ transcript: '识别响应格式错误' });
-             }
-           },
-           fail: (err) => {
-             console.error('ASR request error:', err);
-             if (err.errMsg && err.errMsg.includes('timeout')) {
-               this.setData({ transcript: '识别超时,请重试' });
-             } else {
-               this.setData({ transcript: '语音识别出错' });
-             }
            }
-         });
-       },
-       fail: (err) => {
-         console.error('Failed to read audio file:', err);
-         this.setData({ transcript: '读取音频文件失败' });
-       }
-     });
    },

    translate: function (text) {
-     if (!text || !this.data.sourceLang || !this.data.targetLang) return;
-
-     const source = this.data.sourceLang;
-     const target = this.data.targetLang;
-
-     if (source === target) {
-       this.setData({ outputText: text });
-       return;
      }
-
      this.setData({ outputText: '正在翻译...' });
-
      wx.request({
-       url: `${this.data.hfSpaceUrl}/api/predict`,
        method: 'POST',
-       header: {
-         'content-type': 'application/json'
-       },
-       data: {
-         // Gradio API expects data in a specific format for predict
-         data: [text, source, target]
-       },
-       timeout: 30000, // 30 seconds timeout for translation
        success: (res) => {
-         try {
-           const data = res.data;
-           if (res.statusCode === 200 && data.data && data.data.length > 0) {
-             const translatedText = data.data[0];
-             this.setData({ outputText: translatedText });
-           } else {
-             this.setData({ outputText: '翻译失败' });
-             console.error('Translation backend response error:', res);
-           }
-         } catch (e) {
-           console.error('Failed to parse Translation response JSON:', e, res.data);
-           this.setData({ outputText: '翻译响应格式错误' });
          }
        },
-       fail: (err) => {
-         console.error('Translation request error:', err);
-         this.setData({ outputText: '翻译出错' });
-       }
      });
    }
  });
 
+ // FINAL VERSION: v29 - Based on the user's working original file
+
+ // Helper function to show detailed errors
+ function showDetailedError(title, content) {
+   wx.showModal({
+     title: title,
+     content: typeof content === 'object' ? JSON.stringify(content) : String(content),
+     showCancel: false
+   });
+ }

  Page({
    data: {
      languages: {
        'zh': { name: '中文', flag: 'cn' },
        'en': { name: 'English', flag: 'us' },
        'ja': { name: '日本語', flag: 'jp' },
        'ko': { name: '한국어', flag: 'kr' }
      },
      sourceLang: 'zh',
      targetLang: 'en',
      transcript: '',
      outputText: '',
      isRecording: false,
+     hfSpaceUrl: 'https://dazaozi-wechat-translator-app.hf.space',
    },

    onLoad: function () {
+     // Use the working pattern: attach recorderManager to `this`
      this.recorderManager = wx.getRecorderManager();
      this.initRecorderManager();

+     // Use the improved, simpler language list setup
+     this.setData({
+       sourceLanguages: Object.keys(this.data.languages).map(key => ({ ...this.data.languages[key], langCode: key })),
+       targetLanguages: Object.keys(this.data.languages).map(key => ({ ...this.data.languages[key], langCode: key }))
      });
    },

+   // --- Language Selection & UI (Simplified) ---
+   selectSourceLanguage: function (e) { this.setData({ sourceLang: e.currentTarget.dataset.langCode }); },
+   selectTargetLanguage: function (e) { this.setData({ targetLang: e.currentTarget.dataset.langCode }); },
    swapLanguages: function () {
+     this.setData({ sourceLang: this.data.targetLang, targetLang: this.data.sourceLang, transcript: this.data.outputText, outputText: this.data.transcript });
    },

+   // --- Unified Native Recorder Initialization (Correct Pattern) ---
    initRecorderManager: function () {
      this.recorderManager.onStart(() => {
+       // Correct pattern: set UI state *inside* the onStart callback
        this.setData({ isRecording: true, transcript: '正在聆听...', outputText: '' });
      });

      this.recorderManager.onStop((res) => {
        this.setData({ isRecording: false });
+       if (res.tempFilePath) {
+         this.uploadAudioForASR(res.tempFilePath);
        } else {
+         // This case might happen if the recording is too short
+         this.setData({ transcript: '录音时间太短或无效' });
        }
      });

      this.recorderManager.onError((res) => {
+       this.setData({ isRecording: false });
+       showDetailedError('录音发生错误', res);
      });
    },

+   // --- Main Record Button Handler (with Permissions) ---
+   handleRecordToggle: function () {
+     if (this.data.isRecording) {
+       this.stopRecording();
+       return;
+     }
      wx.getSetting({
        success: (res) => {
          if (!res.authSetting['scope.record']) {
+           // Wrap in an arrow function so `this` stays bound to the Page
+           wx.authorize({ scope: 'scope.record', success: () => this.startRecording(), fail: (err) => showDetailedError('授权失败', err) });
          } else {
+           this.startRecording();
          }
+       },
+       fail: (err) => showDetailedError('无法获取权限设置', err)
      });
    },

+   // --- Unified Start/Stop Recording ---
+   startRecording: function () {
+     const options = {
+       duration: 60000,       // Max recording duration: 60 s
+       sampleRate: 16000,     // For ASR, 16 kHz is the standard
+       numberOfChannels: 1,   // Mono audio is sufficient
+       encodeBitRate: 48000,  // 48 kbps is a good balance for speech
+       format: 'mp3'          // Use MP3 format
+     };
+     this.recorderManager.start(options);
+   },
+
  stopRecording: function () {
101
+ this.recorderManager.stop();
 
 
102
  },
103
 
104
+ // --- Unified Backend ASR & Translation Flow ---
105
  uploadAudioForASR: function (filePath) {
 
106
  this.setData({ transcript: '正在识别...' });
107
+ wx.getFileSystemManager().readFile({ filePath, encoding: 'base64', success: (res) => {
108
+ wx.request({
109
+ url: `${this.data.hfSpaceUrl}/api/asr`,
110
+ method: 'POST',
111
+ data: { "audio_data_uri": `data:audio/mp3;base64,${res.data}` },
112
+ timeout: 60000,
113
+ success: (asrRes) => {
114
+ if (asrRes.statusCode === 200 && asrRes.data.transcript) {
115
+ const transcript = asrRes.data.transcript;
116
+ this.setData({ transcript });
117
+ this.translate(transcript);
118
+ } else {
119
+ showDetailedError('语音识别失败', asrRes.data);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  }
121
+ },
122
+ fail: (err) => showDetailedError('识别请求失败', err)
123
+ });
124
+ }});
 
 
 
125
  },
126
 
127
  translate: function (text) {
128
+ if (!text) return;
129
+ const { sourceLang, targetLang } = this.data;
130
+ if (sourceLang === targetLang) {
131
+ return this.setData({ outputText: text });
 
 
 
 
132
  }
 
133
  this.setData({ outputText: '正在翻译...' });
 
134
  wx.request({
135
+ url: `${this.data.hfSpaceUrl}/api/translate`,
136
  method: 'POST',
137
+ data: { "text": text, "source_lang": sourceLang, "target_lang": targetLang },
138
+ timeout: 45000,
 
 
 
 
 
 
139
  success: (res) => {
140
+ if (res.statusCode === 200 && res.data.translated_text) {
141
+ this.setData({ outputText: res.data.translated_text });
142
+ } else {
143
+ showDetailedError('翻译失败', res.data);
 
 
 
 
 
 
 
 
144
  }
145
  },
146
+ fail: (err) => showDetailedError('翻译请求失败', err)
 
 
 
147
  });
148
  }
149
  });
requirements.txt CHANGED
@@ -2,3 +2,10 @@ transformers
  torch
  sentencepiece
  gradio
+ soundfile
+ sacremoses
+ fastapi
+ uvicorn
+ deepl
+ python-dotenv
+ optimum[onnxruntime]