import flask
from flask import request, jsonify
import logging
import numpy as np
import ffmpeg
from faster_whisper import WhisperModel
import os

# --- Configuration from Environment Variables ---
MODEL_NAME = os.getenv("MODEL_NAME", "small")
CPU_THREADS = int(os.getenv("CPU_THREADS", "2"))

# --- Setup ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# --- Load Model with Performance Optimizations ---
model = None
try:
    logging.info(f"Loading faster-whisper model '{MODEL_NAME}' with {CPU_THREADS} threads...")
    # Fall back to int8 compute type, which has the best compatibility on CPU
    model = WhisperModel(MODEL_NAME, device="cpu", compute_type="int8", cpu_threads=CPU_THREADS, num_workers=1)
logging.info(f"Whisper '{MODEL_NAME}' model loaded successfully.") | |
except Exception as e: | |
logging.error(f"Fatal error: Could not load model. {e}") | |
exit(1) | |
# --- Initialize Flask App --- | |
app = flask.Flask(__name__) | |

def load_audio_from_buffer(buffer, sr=16000):
    """Decodes audio from an in-memory buffer using ffmpeg."""
    try:
        out, _ = (
            ffmpeg.input('pipe:', threads=0)
            .output('pipe:', format='s16le', acodec='pcm_s16le', ac=1, ar=sr)
            .run(input=buffer, capture_stdout=True, capture_stderr=True)
        )
    except ffmpeg.Error as e:
        raise RuntimeError(f"Failed to load audio with ffmpeg: {e.stderr.decode()}") from e
    return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0

# NOTE: the handler below has no route decorator in the original file; the
# endpoint path here is an assumption so the app is actually servable.
@app.route('/transcribe', methods=['POST'])
def transcribe_audio():
    """Transcribes an audio file posted as the multipart form field 'audio'."""
    if 'audio' not in request.files:
        return jsonify({"error": "No audio file part in the request"}), 400
    file = request.files['audio']
    if file.filename == '':
        return jsonify({"error": "No selected file"}), 400
    try:
        audio_buffer = file.read()
        audio_np = load_audio_from_buffer(audio_buffer)

        source_lang_full = request.form.get('sourceLang')
        lang_code = None
        if source_lang_full:
            lang_code = source_lang_full.split('-')[0]
            logging.info(f"Client requested language: {lang_code}")
        # Keep the effective performance optimizations: smaller beam_size and tuned VAD
        segments, info = model.transcribe(
            audio_np,
            language=lang_code,
            beam_size=2,  # reduced from the default of 5 to significantly cut compute
            vad_filter=True,
            vad_parameters=dict(min_silence_duration_ms=700)  # slightly longer minimum-silence window
        )
transcript = "".join(segment.text for segment in segments).strip() | |
logging.info(f"Detected language '{info.language}' with probability {info.language_probability:.2f}") | |
logging.info(f"Transcription result: '{transcript}'") | |
return jsonify({"transcript": transcript}) | |
except Exception as e: | |
logging.error(f"Error during transcription: {e}", exc_info=True) | |
return jsonify({"error": f"Transcription failed: {str(e)}"}), 500 | |