Spaces · Sleeping
AKIRA committed · Commit ad71343 · 1 Parent(s): b20b423

feat: Optimize ASR speed with ONNX and audio compression

Browse files
- app.py +152 -101
- pages/index/index.js +81 -167
- requirements.txt +7 -0
app.py CHANGED
@@ -1,125 +1,176 @@

Before (excerpt):

 import gradio as gr
-from transformers import pipeline
 import torch

-# 1. Load Models
-print("Loading models...")

-# ASR
-    model="openai/whisper-small",
-    torch_dtype=torch.float16,  # Use float16 for faster inference
-    device="cpu"  # Specify CPU device
-)

 translators = {
     "en-zh": pipeline("translation", model="Helsinki-NLP/opus-mt-en-zh"),
-    "zh-en": pipeline("translation", model="
-    "en-ja": pipeline("translation", model="
     "ja-en": pipeline("translation", model="Helsinki-NLP/opus-mt-ja-en"),
-    "en-ko": pipeline("translation", model="Helsinki-NLP/opus-mt-tc-big-en-ko"),
     "ko-en": pipeline("translation", model="Helsinki-NLP/opus-mt-ko-en"),
 }

-# 2. Define Processing Functions
-    if audio_file is None:
-        print("Audio file is None.")
-        return ""
     try:
-        print(f"Audio file path exists: {os.path.exists(audio_path)}")
-        print(f"Audio file size: {os.path.getsize(audio_path)} bytes")
-        # The pipeline expects a file path
-        text = asr_pipeline(audio_path)["text"]
-        print(f"ASR result: {text}")
-        os.remove(audio_path)
     except Exception as e:
-        return

 def translate_text(text, source_lang, target_lang):

-# 4. Launch the App
 if __name__ == "__main__":
After:

 import gradio as gr
+from transformers import pipeline, AutoProcessor
+from optimum.onnxruntime import ORTModelForSpeechSeq2Seq
 import torch
+import os
+import base64
+import tempfile
+from fastapi import FastAPI, Request
+from fastapi.responses import JSONResponse
+import uvicorn
+import deepl
+from dotenv import load_dotenv
+import soundfile as sf
+
+# --- Load environment variables and initialize DeepL ---
+load_dotenv()
+
+DEEPL_AUTH_KEY = os.getenv("DEEPL_AUTH_KEY")
+deepl_translator = None
+if DEEPL_AUTH_KEY:
+    try:
+        deepl_translator = deepl.Translator(DEEPL_AUTH_KEY)
+        print("DeepL translator initialized successfully.")
+    except Exception as e:
+        print(f"Error initializing DeepL translator: {e}")
+        print("DeepL will be unavailable.")
+else:
+    print("DEEPL_AUTH_KEY not found. DeepL will be unavailable.")
+# --- End ---

+# 1. Load Models
+print("Loading all models... This will take some time on startup.")

+# ASR Model - Using a CPU-optimized ONNX model for speed
+print("Loading optimized ASR model...")
+asr_model_id = "openai/whisper-base"

+# Load the model and processor using Optimum for ONNX runtime acceleration
+asr_model = ORTModelForSpeechSeq2Seq.from_pretrained(asr_model_id, provider="CPUExecutionProvider")
+asr_processor = AutoProcessor.from_pretrained(asr_model_id)
+print("Optimized ASR model loaded.")

+# Translation Pipelines - Reverting to the 6 core, absolutely reliable models
 translators = {
     "en-zh": pipeline("translation", model="Helsinki-NLP/opus-mt-en-zh"),
+    "zh-en": pipeline("translation", model="Varine/opus-mt-zh-en-model"),
+    "en-ja": pipeline("translation", model="staka/fugumt-en-ja"),
     "ja-en": pipeline("translation", model="Helsinki-NLP/opus-mt-ja-en"),
+    "en-ko": pipeline("translation", model="Helsinki-NLP/opus-mt-tc-big-en-ko"),
     "ko-en": pipeline("translation", model="Helsinki-NLP/opus-mt-ko-en"),
 }

+print("All models loaded successfully.")

+# 2. Define Core Logic Functions
+def transcribe_audio(audio_bytes):
     try:
+        # Use a temporary file to handle the audio bytes
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
+            tmp_file.write(audio_bytes)
+            audio_path = tmp_file.name
+
+        # Read the audio file and process it
+        audio_input, sample_rate = sf.read(audio_path)

+        # Ensure the audio is in the correct format (mono, 16kHz)
+        if audio_input.ndim > 1:
+            audio_input = audio_input.mean(axis=1)  # to mono
+        if sample_rate != 16000:
+            # This is a placeholder for resampling. For now, we assume frontend sends 16kHz.
+            pass
+
+        # Process audio and generate token IDs
+        input_features = asr_processor(audio_input, sampling_rate=16000, return_tensors="pt").input_features
+        predicted_ids = asr_model.generate(input_features)

+        # Decode the token IDs to text
+        text = asr_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

+        os.remove(audio_path)
+        return text, None
     except Exception as e:
+        # Clean up the temp file in case of an error
+        if 'audio_path' in locals() and os.path.exists(audio_path):
+            os.remove(audio_path)
+        return None, str(e)

 def translate_text(text, source_lang, target_lang):
+    if not text or not source_lang or not target_lang:
+        return "", None
+    if source_lang == target_lang:
+        return text, None
+
+    # --- DeepL Hybrid Logic ---
+    if source_lang == 'zh' and target_lang == 'ja' and deepl_translator:
+        print("Attempting translation with DeepL for zh -> ja")
+        try:
+            result = deepl_translator.translate_text(text, target_lang="JA")
+            return result.text, None
+        except Exception as e:
+            print(f"DeepL API call failed: {e}. Falling back to Hugging Face model.")
+    # --- End ---
+
+    key = f"{source_lang}-{target_lang}"
+    try:
+        if key in translators:
+            return translators[key](text)[0]['translation_text'], None

+        elif source_lang != 'en' and target_lang != 'en':
+            if f"{source_lang}-en" not in translators or f"en-{target_lang}" not in translators:
+                return None, f"Bridge translation route not supported: {source_lang}-en or en-{target_lang}"
+
+            print(f"Performing bridge translation: {source_lang} -> en -> {target_lang}")
+            english_text = translators[f"{source_lang}-en"](text)[0]['translation_text']
+            return translators[f"en-{target_lang}"](english_text)[0]['translation_text'], None
+        else:
+            return None, f"Translation route not supported: {key}"
+    except Exception as e:
+        return None, str(e)
+
+# 3. Create FastAPI App
+app = FastAPI()
+
+# 4. Define API Endpoints with FastAPI
+@app.post("/api/asr")
+async def api_asr(request: Request):
+    json_data = await request.json()
+    audio_data_uri = json_data.get('audio_data_uri')
+    if not audio_data_uri:
+        return JSONResponse(status_code=400, content={"error": "No audio_data_uri provided"})
+    try:
+        header, encoded = audio_data_uri.split(",", 1)
+        audio_bytes = base64.b64decode(encoded)
+        transcript, error = transcribe_audio(audio_bytes)
+        if error:
+            return JSONResponse(status_code=500, content={"error": f"ASR Error: {error}"})
+        return JSONResponse(status_code=200, content={"transcript": transcript})
+    except Exception as e:
+        return JSONResponse(status_code=500, content={"error": f"Server error: {e}"})
+
+@app.post("/api/translate")
+async def api_translate(request: Request):
+    json_data = await request.json()
+    text = json_data.get('text')
+    source_lang = json_data.get('source_lang')
+    target_lang = json_data.get('target_lang')
+    if not all([text, source_lang, target_lang]):
+        return JSONResponse(status_code=400, content={"error": "Missing parameters"})

+    translated_text, error = translate_text(text, source_lang, target_lang)
+    if error:
+        return JSONResponse(status_code=500, content={"error": error})
+    return JSONResponse(status_code=200, content={"translated_text": translated_text})
+
+# 5. Create a simple Gradio UI for debugging (Optional)
+def gradio_asr(audio_file):
+    if audio_file is None:
+        return ""
+    # Gradio provides a file object, read its bytes
+    audio_input, sample_rate = sf.read(audio_file.name)
+    # Process audio and generate token IDs
+    input_features = asr_processor(audio_input, sampling_rate=sample_rate, return_tensors="pt").input_features
+    predicted_ids = asr_model.generate(input_features)
+    # Decode the token IDs to text
+    transcript = asr_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+    return transcript
+
+gradio_ui = gr.Interface(fn=gradio_asr, inputs=gr.Audio(type="filepath"), outputs="text", title="ASR Debugger")

+# 6. Mount Gradio app onto FastAPI
+app = gr.mount_gradio_app(app, gradio_ui, path="/")

 if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=7860)
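For reference, a minimal client sketch for the two new JSON endpoints defined above. It is not part of the commit: the localhost URL assumes the server was started with "python app.py" (which runs uvicorn on port 7860), and sample.mp3 is a hypothetical 16 kHz mono recording standing in for what the mini-program uploads.

# Illustrative only - exercises /api/asr and /api/translate as defined in app.py.
import base64
import requests

BASE_URL = "http://localhost:7860"  # or the deployed Space URL

# /api/asr expects a base64 data URI in the "audio_data_uri" field
with open("sample.mp3", "rb") as f:
    audio_data_uri = "data:audio/mp3;base64," + base64.b64encode(f.read()).decode()

asr_resp = requests.post(f"{BASE_URL}/api/asr", json={"audio_data_uri": audio_data_uri}, timeout=60)
transcript = asr_resp.json()["transcript"]

# /api/translate takes the transcript plus source/target language codes
tr_resp = requests.post(
    f"{BASE_URL}/api/translate",
    json={"text": transcript, "source_lang": "zh", "target_lang": "en"},
    timeout=60,
)
print(transcript, "->", tr_resp.json()["translated_text"])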
pages/index/index.js CHANGED
@@ -1,235 +1,149 @@

Before (excerpt):

 Page({
   data: {
-    // Updated language codes to match Hugging Face API
     languages: {
       'zh': { name: '中文', flag: 'cn' },
       'en': { name: 'English', flag: 'us' },
       'ja': { name: '日本語', flag: 'jp' },
       'ko': { name: '한국어', flag: 'kr' }
     },
-    langCodes: ['zh', 'en', 'ja', 'ko'],
     sourceLang: 'zh',
     targetLang: 'en',
     transcript: '',
     outputText: '',
     isRecording: false,
-    targetLanguages: [],
-    // Hugging Face Space API URL
-    hfSpaceUrl: 'https://dazaozi-wechat-translator-app.hf.space' // REPLACE WITH YOUR ACTUAL SPACE URL
   },

   onLoad: function () {
     this.recorderManager = wx.getRecorderManager();
     this.initRecorderManager();
-  },

-  initializeLanguages: function () {
-    const { langCodes, languages, sourceLang, targetLang } = this.data;
-    const sourceLanguages = langCodes.map(code => ({
-      langCode: code,
-      name: languages[code].name,
-      flag: languages[code].flag,
-      selected: code === sourceLang
-    }));
-    const targetLanguages = langCodes.map(code => ({
-      langCode: code,
-      name: languages[code].name,
-      flag: languages[code].flag,
-      selected: code === targetLang
-    }));
-    this.setData({ sourceLanguages, targetLanguages });
-  },

-  selectSourceLanguage: function (e) {
-    const newSourceLang = e.currentTarget.dataset.langCode;
-    this.setData({ sourceLang: newSourceLang }, () => {
-      this.initializeLanguages();
-      if (this.data.transcript.trim() && this.data.transcript !== '正在聆听...' && this.data.transcript !== '未能识别到语音') {
-        this.translate(this.data.transcript);
-      }
-    });
-  },

-      if (this.data.transcript.trim() && this.data.transcript !== '正在聆听...' && this.data.transcript !== '未能识别到语音') {
-        this.translate(this.data.transcript);
-      }
     });
   },

   swapLanguages: function () {
-    const tempLang = sourceLang;
-    sourceLang = targetLang;
-    targetLang = tempLang;
-    const tempText = transcript;
-    transcript = outputText;
-    outputText = tempText;
-    this.setData({ sourceLang, targetLang, transcript, outputText }, () => {
-      this.initializeLanguages();
-      if (this.data.transcript.trim() && this.data.transcript !== '正在聆听...' && this.data.transcript !== '未能识别到语音') {
-        this.translate(this.data.transcript);
-      }
-    });
   },

   initRecorderManager: function () {
     this.recorderManager.onStart(() => {
       this.setData({ isRecording: true, transcript: '正在聆听...', outputText: '' });
     });

     this.recorderManager.onStop((res) => {
-      console.log('recorder stop', res);
       this.setData({ isRecording: false });
-        this.uploadAudioForASR(tempFilePath);
       } else {
       }
     });

     this.recorderManager.onError((res) => {
     });
   },

     wx.getSetting({
       success: (res) => {
         if (!res.authSetting['scope.record']) {
-          wx.authorize({
-            scope: 'scope.record',
-            success: () => {
-              this.recorderManager.start({ duration: 60000, sampleRate: 16000, numberOfChannels: 1, encodeBitRate: 96000, format: 'mp3' });
-            },
-            fail: () => {
-              wx.showToast({ title: '请授权麦克风权限', icon: 'none' });
-            }
-          });
         } else {
         }
-      }
     });
   },

   stopRecording: function () {
-    this.recorderManager.stop();
-    }
   },

   uploadAudioForASR: function (filePath) {
-    console.log('Sending audio to Hugging Face ASR backend:', filePath);
     this.setData({ transcript: '正在识别...' });
-      timeout: 60000, // 60 seconds timeout for ASR
-      success: (res) => {
-        try {
-          const data = JSON.parse(res.data);
-          // Gradio API returns result in data[0]
-          if (res.statusCode === 200 && data.data && data.data.length > 0) {
-            const transcript = data.data[0];
-            if (transcript) {
-              this.setData({ transcript: transcript });
-              this.translate(transcript);
-            } else {
-              this.setData({ transcript: '未能识别到语音' });
-            }
-          } else {
-            this.setData({ transcript: '语音识别失败' });
-            console.error('ASR backend response error:', res);
-          }
-        } catch (e) {
-          console.error('Failed to parse ASR response JSON:', e, res.data);
-          this.setData({ transcript: '识别响应格式错误' });
-        }
-      },
-      fail: (err) => {
-        console.error('ASR request error:', err);
-        if (err.errMsg && err.errMsg.includes('timeout')) {
-          this.setData({ transcript: '识别超时,请重试' });
-        } else {
-          this.setData({ transcript: '语音识别出错' });
-        }
       }
-        this.setData({ transcript: '读取音频文件失败' });
-      }
-    });
   },

   translate: function (text) {
-    if (source === target) {
-      this.setData({ outputText: text });
-      return;
     }
     this.setData({ outputText: '正在翻译...' });
     wx.request({
-      url: `${this.data.hfSpaceUrl}/api/
       method: 'POST',
-      data: {
-        // Gradio API expects data in a specific format for predict
-        data: [text, source, target]
-      },
-      timeout: 30000, // 30 seconds timeout for translation
       success: (res) => {
-          this.setData({ outputText: translatedText });
-        } else {
-          this.setData({ outputText: '翻译失败' });
-          console.error('Translation backend response error:', res);
-        }
-      } catch (e) {
-        console.error('Failed to parse Translation response JSON:', e, res.data);
-        this.setData({ outputText: '翻译响应格式错误' });
       }
     },
-      fail: (err) =>
-        console.error('Translation request error:', err);
-        this.setData({ outputText: '翻译出错' });
-      }
     });
   }
 });
After:

+// FINAL VERSION: v29 - Based on the user's working original file
+
+// Helper function to show detailed errors
+function showDetailedError(title, content) {
+  wx.showModal({
+    title: title,
+    content: typeof content === 'object' ? JSON.stringify(content) : String(content),
+    showCancel: false
+  });
+}

 Page({
   data: {
     languages: {
       'zh': { name: '中文', flag: 'cn' },
       'en': { name: 'English', flag: 'us' },
       'ja': { name: '日本語', flag: 'jp' },
       'ko': { name: '한국어', flag: 'kr' }
     },
     sourceLang: 'zh',
     targetLang: 'en',
     transcript: '',
     outputText: '',
     isRecording: false,
+    hfSpaceUrl: 'https://dazaozi-wechat-translator-app.hf.space',
   },

   onLoad: function () {
+    // Use the working pattern: attach recorderManager to `this`
     this.recorderManager = wx.getRecorderManager();
     this.initRecorderManager();

+    // Use the improved, simpler language list setup
+    this.setData({
+      sourceLanguages: Object.keys(this.data.languages).map(key => ({ ...this.data.languages[key], langCode: key })),
+      targetLanguages: Object.keys(this.data.languages).map(key => ({ ...this.data.languages[key], langCode: key }))
     });
   },

+  // --- Language Selection & UI (Simplified) ---
+  selectSourceLanguage: function (e) { this.setData({ sourceLang: e.currentTarget.dataset.langCode }); },
+  selectTargetLanguage: function (e) { this.setData({ targetLang: e.currentTarget.dataset.langCode }); },
   swapLanguages: function () {
+    this.setData({ sourceLang: this.data.targetLang, targetLang: this.data.sourceLang, transcript: this.data.outputText, outputText: this.data.transcript });
   },

+  // --- Unified Native Recorder Initialization (Correct Pattern) ---
   initRecorderManager: function () {
     this.recorderManager.onStart(() => {
+      // Correct pattern: Set UI state *inside* the onStart callback
       this.setData({ isRecording: true, transcript: '正在聆听...', outputText: '' });
     });

     this.recorderManager.onStop((res) => {
       this.setData({ isRecording: false });
+      if (res.tempFilePath) {
+        this.uploadAudioForASR(res.tempFilePath);
       } else {
+        // This case might happen if recording is too short
+        this.setData({ transcript: '录音时间太短或无效' });
       }
     });

     this.recorderManager.onError((res) => {
+      this.setData({ isRecording: false });
+      showDetailedError('录音发生错误', res);
     });
   },

+  // --- Main Record Button Handler (with Permissions) ---
+  handleRecordToggle: function() {
+    if (this.data.isRecording) {
+      this.stopRecording();
+      return;
+    }
     wx.getSetting({
       success: (res) => {
         if (!res.authSetting['scope.record']) {
+          wx.authorize({ scope: 'scope.record', success: this.startRecording, fail: (err) => showDetailedError('授权失败', err) });
         } else {
+          this.startRecording();
         }
+      },
+      fail: (err) => showDetailedError('无法获取权限设置', err)
     });
   },

+  // --- Unified Start/Stop Recording ---
+  startRecording: function () {
+    const options = {
+      duration: 60000, // Max recording duration: 60s
+      sampleRate: 16000, // For ASR, 16kHz is the standard
+      numberOfChannels: 1, // Mono audio is sufficient
+      encodeBitRate: 48000, // 48kbps is a good balance for speech
+      format: 'mp3' // Use mp3 format
+    };
+    this.recorderManager.start(options);
+  },
+
   stopRecording: function () {
+    this.recorderManager.stop();
   },

+  // --- Unified Backend ASR & Translation Flow ---
   uploadAudioForASR: function (filePath) {
     this.setData({ transcript: '正在识别...' });
+    wx.getFileSystemManager().readFile({ filePath, encoding: 'base64', success: (res) => {
+      wx.request({
+        url: `${this.data.hfSpaceUrl}/api/asr`,
+        method: 'POST',
+        data: { "audio_data_uri": `data:audio/mp3;base64,${res.data}` },
+        timeout: 60000,
+        success: (asrRes) => {
+          if (asrRes.statusCode === 200 && asrRes.data.transcript) {
+            const transcript = asrRes.data.transcript;
+            this.setData({ transcript });
+            this.translate(transcript);
+          } else {
+            showDetailedError('语音识别失败', asrRes.data);
           }
+        },
+        fail: (err) => showDetailedError('识别请求失败', err)
+      });
+    }});
   },

   translate: function (text) {
+    if (!text) return;
+    const { sourceLang, targetLang } = this.data;
+    if (sourceLang === targetLang) {
+      return this.setData({ outputText: text });
     }
     this.setData({ outputText: '正在翻译...' });
     wx.request({
+      url: `${this.data.hfSpaceUrl}/api/translate`,
       method: 'POST',
+      data: { "text": text, "source_lang": sourceLang, "target_lang": targetLang },
+      timeout: 45000,
       success: (res) => {
+        if (res.statusCode === 200 && res.data.translated_text) {
+          this.setData({ outputText: res.data.translated_text });
+        } else {
+          showDetailedError('翻译失败', res.data);
         }
       },
+      fail: (err) => showDetailedError('翻译请求失败', err)
     });
   }
 });
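The "audio compression" half of the commit title comes from the recorder options above: encodeBitRate drops from 96000 in the removed code to 48000. At 48 kbps, a 10-second clip is roughly 48,000 / 8 × 10 ≈ 60 KB of MP3, and base64 encoding adds about one third, so the audio_data_uri payload posted to /api/asr lands around 80 KB, roughly half of what the previous 96 kbps setting would have produced.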
requirements.txt CHANGED
@@ -2,3 +2,10 @@ transformers

 torch
 sentencepiece
 gradio
+soundfile
+sacremoses
+fastapi
+uvicorn
+deepl
+python-dotenv
+optimum[onnxruntime]