# wechat-translator-app / asr_server.py
# Author: AKIRA — "Finalize all local changes" (commit b3b0b53)
import flask
from flask import request, jsonify
import logging
import numpy as np
import ffmpeg
from faster_whisper import WhisperModel
import os
# --- Configuration from Environment Variables ---
# Whisper model size to load, e.g. "tiny", "small", "medium".
MODEL_NAME = os.environ.get("MODEL_NAME", "small")
# Number of CPU threads handed to the inference backend.
CPU_THREADS = int(os.environ.get("CPU_THREADS", "2"))

# --- Setup ---
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
# --- Load Model with Performance Optimizations ---
model = None
try:
    logging.info(f"Loading faster-whisper model '{MODEL_NAME}' with {CPU_THREADS} threads...")
    # int8 compute mode: chosen for the broadest CPU compatibility.
    model = WhisperModel(MODEL_NAME, device="cpu", compute_type="int8", cpu_threads=CPU_THREADS, num_workers=1)
    logging.info(f"Whisper '{MODEL_NAME}' model loaded successfully.")
except Exception as e:
    logging.error(f"Fatal error: Could not load model. {e}")
    # Raise SystemExit rather than calling exit(): the exit() builtin is
    # injected by the site module and is not guaranteed to exist (e.g. when
    # the interpreter runs with -S). Exit code 1 is preserved.
    raise SystemExit(1)
# --- Initialize Flask App ---
# Single WSGI application object; the /transcribe route below is registered on it.
app = flask.Flask(__name__)
def load_audio_from_buffer(buffer, sr=16000):
    """Decode an in-memory audio buffer to mono float32 PCM using ffmpeg.

    Args:
        buffer: Raw bytes of an audio file in any container ffmpeg can read.
        sr: Target sample rate in Hz (default 16000).

    Returns:
        A 1-D numpy float32 array of samples scaled to [-1.0, 1.0).

    Raises:
        RuntimeError: If ffmpeg fails to decode the buffer (ffmpeg's stderr
            output is included in the message).
    """
    # Decode from stdin to stdout: 16-bit signed little-endian PCM,
    # one channel, resampled to `sr`.
    pipeline = ffmpeg.input('pipe:', threads=0).output(
        'pipe:', format='s16le', acodec='pcm_s16le', ac=1, ar=sr
    )
    try:
        raw_pcm, _ = pipeline.run(input=buffer, capture_stdout=True, capture_stderr=True)
    except ffmpeg.Error as e:
        raise RuntimeError(f"Failed to load audio with ffmpeg: {e.stderr.decode()}") from e
    samples = np.frombuffer(raw_pcm, np.int16).flatten()
    # Normalize int16 range to float32 in [-1.0, 1.0).
    return samples.astype(np.float32) / 32768.0
@app.route('/transcribe', methods=['POST'])
def transcribe_audio():
    """Transcribe an uploaded audio file.

    Expects a multipart/form-data POST with:
        audio: the audio file to transcribe (required).
        sourceLang: optional language tag such as "zh-CN"; only the primary
            subtag (before the first '-') is passed to the model. When
            omitted, the model auto-detects the language.

    Returns:
        JSON {"transcript": str} on success, or {"error": str} with
        HTTP 400 (bad request) / 500 (decode or transcription failure).
    """
    if 'audio' not in request.files:
        return jsonify({"error": "No audio file part in the request"}), 400
    file = request.files['audio']
    if file.filename == '':
        return jsonify({"error": "No selected file"}), 400
    try:
        audio_buffer = file.read()
        audio_np = load_audio_from_buffer(audio_buffer)
        source_lang_full = request.form.get('sourceLang')
        lang_code = None
        if source_lang_full:
            # "zh-CN" -> "zh". Guard against a malformed tag like "-US":
            # split would yield "" and an empty language code would be
            # rejected by the model, so fall back to auto-detection.
            lang_code = source_lang_full.split('-')[0] or None
            logging.info(f"Client requested language: {lang_code}")
        # Retained performance tuning: beam_size lowered from the default 5
        # to 2 to cut compute, and the VAD silence threshold raised slightly.
        segments, info = model.transcribe(
            audio_np,
            language=lang_code,
            beam_size=2,
            vad_filter=True,
            vad_parameters=dict(min_silence_duration_ms=700),
        )
        transcript = "".join(segment.text for segment in segments).strip()
        logging.info(f"Detected language '{info.language}' with probability {info.language_probability:.2f}")
        logging.info(f"Transcription result: '{transcript}'")
        return jsonify({"transcript": transcript})
    except Exception as e:
        logging.error(f"Error during transcription: {e}", exc_info=True)
        return jsonify({"error": f"Transcription failed: {str(e)}"}), 500