"""Flask microservice that transcribes uploaded audio with faster-whisper.

Audio arrives as a multipart upload, is decoded in memory to 16 kHz mono
PCM via ffmpeg, and is transcribed on CPU with an int8-quantized Whisper
model. Configuration comes from the environment:

* ``MODEL_NAME``  - Whisper model size (default ``"small"``).
* ``CPU_THREADS`` - threads given to the inference engine (default ``2``).
"""

import logging
import os

import flask
import numpy as np
from flask import jsonify, request

import ffmpeg
from faster_whisper import WhisperModel

# --- Configuration from Environment Variables ---
MODEL_NAME = os.getenv("MODEL_NAME", "small")
CPU_THREADS = int(os.getenv("CPU_THREADS", "2"))

# --- Setup ---
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

# --- Load Model with Performance Optimizations ---
model = None
try:
    logging.info(f"Loading faster-whisper model '{MODEL_NAME}' with {CPU_THREADS} threads...")
    # int8 is the most broadly compatible compute type on CPU, so fall
    # back to it rather than float16/int8_float16.
    model = WhisperModel(
        MODEL_NAME,
        device="cpu",
        compute_type="int8",
        cpu_threads=CPU_THREADS,
        num_workers=1,
    )
    logging.info(f"Whisper '{MODEL_NAME}' model loaded successfully.")
except Exception as e:
    logging.error(f"Fatal error: Could not load model. {e}")
    # raise SystemExit instead of the site-injected exit() builtin, which
    # may be absent under `python -S` or some embedded interpreters.
    raise SystemExit(1)

# --- Initialize Flask App ---
app = flask.Flask(__name__)


def load_audio_from_buffer(buffer, sr=16000):
    """Decode audio from an in-memory buffer using ffmpeg.

    Args:
        buffer: Raw encoded audio bytes (any container/codec ffmpeg accepts).
        sr: Target sample rate in Hz (default 16000, what Whisper expects).

    Returns:
        A 1-D ``numpy.float32`` array of mono samples normalized to [-1, 1).

    Raises:
        RuntimeError: If ffmpeg fails to decode the buffer; the ffmpeg
            stderr output is included in the message.
    """
    try:
        out, _ = (
            ffmpeg.input('pipe:', threads=0)
            .output('pipe:', format='s16le', acodec='pcm_s16le', ac=1, ar=sr)
            .run(input=buffer, capture_stdout=True, capture_stderr=True)
        )
    except ffmpeg.Error as e:
        raise RuntimeError(f"Failed to load audio with ffmpeg: {e.stderr.decode()}") from e
    # s16le PCM -> float32 in [-1, 1): divide by 2**15.
    return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0


@app.route('/transcribe', methods=['POST'])
def transcribe_audio():
    """Transcribe an uploaded audio file.

    Expects a multipart form with an ``audio`` file part and an optional
    ``sourceLang`` field (e.g. ``"en-US"``; only the primary subtag is
    passed to the model). Returns ``{"transcript": ...}`` on success,
    or ``{"error": ...}`` with status 400 (bad request) / 500 (failure).
    """
    if 'audio' not in request.files:
        return jsonify({"error": "No audio file part in the request"}), 400
    file = request.files['audio']
    if file.filename == '':
        return jsonify({"error": "No selected file"}), 400

    try:
        audio_buffer = file.read()
        audio_np = load_audio_from_buffer(audio_buffer)

        source_lang_full = request.form.get('sourceLang')
        lang_code = None
        if source_lang_full:
            # "en-US" -> "en": faster-whisper wants the bare language code.
            lang_code = source_lang_full.split('-')[0]
            logging.info(f"Client requested language: {lang_code}")

        # Keep the effective performance optimizations: a smaller beam
        # size and a tuned VAD silence threshold.
        segments, info = model.transcribe(
            audio_np,
            language=lang_code,
            beam_size=2,  # reduced from the default 5 to cut compute significantly
            vad_filter=True,
            vad_parameters=dict(min_silence_duration_ms=700),  # slightly longer silence window
        )

        # ``segments`` is a generator; joining consumes it and runs inference.
        transcript = "".join(segment.text for segment in segments).strip()

        logging.info(f"Detected language '{info.language}' with probability {info.language_probability:.2f}")
        logging.info(f"Transcription result: '{transcript}'")

        return jsonify({"transcript": transcript})

    except Exception as e:
        logging.error(f"Error during transcription: {e}", exc_info=True)
        return jsonify({"error": f"Transcription failed: {str(e)}"}), 500