from flask import Flask, request, jsonify
from threading import Thread

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# ✅ Load the model and tokenizer
model_name = "Qwen/Qwen1.5-1.8B-Chat"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    torch_dtype="auto",
)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()


# ✅ Core inference function
def chat_qwen(message):
    if not message.strip():
        return "请输入内容。"  # "Please enter a message."
    messages = [
        # System prompt: "You are a polite, smart, Chinese-speaking assistant."
        {"role": "system", "content": "你是一个礼貌、聪明、会说中文的助手。"},
        {"role": "user", "content": message},
    ]
    # Build the chat prompt; add_generation_prompt=True appends the assistant
    # turn marker so the model continues as the assistant.
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(device)
    with torch.no_grad():
        outputs = model.generate(
            inputs,
            max_new_tokens=100,
            do_sample=True,
            top_p=0.8,
            temperature=0.7,
            repetition_penalty=1.2,
            pad_token_id=tokenizer.eos_token_id,
        )
    # Decode only the newly generated tokens, skipping the prompt.
    reply = tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True)
    return reply.strip()


# ✅ Flask backend: POST API
app = Flask(__name__)


@app.route("/chat", methods=["POST"])
def chat_api():
    data = request.get_json(silent=True) or {}
    user_input = data.get("user_message", "")
    reply = chat_qwen(user_input)
    return jsonify({"response": reply})


# ✅ Gradio UI
def run_gradio():
    gr.Interface(
        fn=chat_qwen,
        inputs="text",
        outputs="text",
        title="Qwen 中文助手",
        description="和 Qwen1.5 模型聊天",
    ).launch(server_name="0.0.0.0", server_port=7860)


# ✅ Start Gradio and Flask together: Gradio runs in a background thread,
# Flask blocks in the main thread.
if __name__ == "__main__":
    Thread(target=run_gradio, daemon=True).start()
    app.run(host="0.0.0.0", port=5000)
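
# ✅ Example request against the Flask endpoint (a sketch, assuming both servers
# are running with the default ports above; adjust host/port if you changed them):
#
#   curl -X POST http://localhost:5000/chat \
#        -H "Content-Type: application/json" \
#        -d '{"user_message": "你好，请介绍一下你自己"}'
#
# The Gradio UI is served separately at http://localhost:7860.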