Spaces:
Running
Running
File size: 4,270 Bytes
6378099 4f56acc 6378099 169746e 6378099 169746e 32563c3 169746e 6378099 9e6cedb 32563c3 33860c8 6378099 32563c3 6378099 33860c8 32563c3 9e6cedb 32563c3 9e6cedb 6378099 32563c3 6378099 9e6cedb 32563c3 9e6cedb 32563c3 9e6cedb 32563c3 9e6cedb 32563c3 9e6cedb 32563c3 9e6cedb 32563c3 9e6cedb 4f56acc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 |
# from flask import Flask, request, jsonify
# from sentence_transformers import CrossEncoder
# app = Flask(__name__)
# # Load your cross-encoder model
# model_name = "truong1301/reranker_pho_BLAI" # Replace with your actual model if different
# cross_encoder = CrossEncoder(model_name, max_length=256, num_labels=1)
# # Function to preprocess text with Vietnamese word segmentation
# def preprocess_text(text):
# if not text:
# return text
# segmented_text = rdrsegmenter.word_segment(text)
# # Join tokenized sentences into a single string
# return " ".join([" ".join(sentence) for sentence in segmented_text])
# @app.route("/rerank", methods=["POST"])
# def rerank():
# try:
# # Get JSON data from the request (query and list of documents)
# data = request.get_json()
# query = data.get("query", "")
# documents = data.get("documents", [])
# if not query or not documents:
# return jsonify({"error": "Missing query or documents"}), 400
# # Create pairs of query and documents for reranking
# query_doc_pairs = [(query, doc) for doc in documents]
# # Get reranking scores from the cross-encoder
# scores = cross_encoder.predict(query_doc_pairs).tolist()
# # Combine documents with their scores and sort
# ranked_results = sorted(
# [{"document": doc, "score": score} for doc, score in zip(documents, scores)],
# key=lambda x: x["score"],
# reverse=True
# )
# return jsonify({"results": ranked_results})
# except Exception as e:
# return jsonify({"error": str(e)}), 500
# @app.route("/", methods=["GET"])
# def health_check():
# return jsonify({"status": "Server is running"}), 200
# if __name__ == "__main__":
# app.run(host="0.0.0.0", port=7860) # Default port for Hugging Face Spaces
from flask import Flask, request, jsonify
from transformers import pipeline
from sentence_transformers import CrossEncoder
# Flask application object; the @app.route decorators further down register
# their handlers on it.
app = Flask(__name__)
# Load Vietnamese word segmentation pipeline
# Created once at import time. preprocess_text() calls it with raw text and
# reads the "word" and "entity" fields of each item it returns.
segmenter = pipeline("token-classification", model="NlpHUST/vi-word-segmentation")
# Load your cross-encoder model
model_name = "truong1301/reranker_pho_BLAI" # Replace with your actual model if different
# Cross-encoder used by the /rerank handler to score (query, document) pairs.
# NOTE(review): max_length=256 presumably caps the tokenized pair length and
# num_labels=1 presumably yields a single relevance score per pair — confirm
# against the sentence-transformers CrossEncoder documentation.
cross_encoder = CrossEncoder(model_name, max_length=256, num_labels=1)
# Function to preprocess text using Vietnamese word segmentation
def preprocess_text(text):
    """Return ``text`` re-joined according to the word segmenter's output.

    The module-level ``segmenter`` is called on ``text`` and each item it
    returns is appended to the result as follows:

    * if the item's ``"word"`` contains ``"##"``, the marker is removed and
      the remainder is glued directly onto what came before;
    * otherwise, if the item's ``"entity"`` is ``"I"``, the word is attached
      with a leading underscore;
    * otherwise the word is attached with a leading space.

    Leading and trailing whitespace is stripped from the final string.

    Args:
        text: The text to segment. Falsy values (``None``, ``""``) are
            returned unchanged and the segmenter is not called.

    Returns:
        The segmented string, or ``text`` itself when it is falsy.
    """
    if not text:
        return text

    # Collect the pieces and join once instead of growing a string with
    # ``+=`` on every token, which is quadratic in the worst case.
    pieces = []
    for token in segmenter(text):
        word = token["word"]
        if "##" in word:
            pieces.append(word.replace("##", ""))
        elif token["entity"] == "I":
            pieces.append("_" + word)
        else:
            pieces.append(" " + word)
    return "".join(pieces).strip()
@app.route("/rerank", methods=["POST"])
def rerank():
    """Score documents against a query and return them ordered by score.

    Expects a JSON object body with two fields:

    * ``"query"``: a non-empty string.
    * ``"documents"``: a non-empty list of strings.

    The query and every document are passed through ``preprocess_text``
    before being scored by ``cross_encoder``; the response carries the
    original (unsegmented) document text.

    Returns:
        ``({"results": [{"document": ..., "score": ...}, ...]}, 200)`` with
        results sorted by descending score; ``({"error": ...}, 400)`` when the
        body is not a JSON object or the fields are missing or of the wrong
        type; ``({"error": ...}, 500)`` when segmentation or scoring fails.
    """
    # silent=True turns a missing, mis-typed or unparsable JSON body into
    # None, so it is reported as a client error below rather than being
    # caught by the generic handler and returned as a 500.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object"}), 400

    query = data.get("query", "")
    documents = data.get("documents", [])
    if not query or not documents:
        return jsonify({"error": "Missing query or documents"}), 400

    # Reject wrong types explicitly: a bare string or dict for "documents"
    # would otherwise be iterated character by character / key by key.
    if not isinstance(query, str):
        return jsonify({"error": "query must be a string"}), 400
    if not isinstance(documents, list) or not all(
        isinstance(doc, str) for doc in documents
    ):
        return jsonify({"error": "documents must be a list of strings"}), 400

    try:
        # Apply Vietnamese word segmentation preprocessing
        segmented_query = preprocess_text(query)
        # Create pairs of query and documents for reranking
        query_doc_pairs = [
            (segmented_query, preprocess_text(doc)) for doc in documents
        ]
        # Get reranking scores from the cross-encoder
        scores = cross_encoder.predict(query_doc_pairs).tolist()
    except Exception as e:
        # Boundary handler: keep the traceback in the server log and return
        # the message to the caller, as the endpoint has always done.
        app.logger.exception("Reranking failed")
        return jsonify({"error": str(e)}), 500

    # Combine the original documents with their scores, best match first
    ranked_results = sorted(
        [{"document": doc, "score": score} for doc, score in zip(documents, scores)],
        key=lambda x: x["score"],
        reverse=True,
    )
    return jsonify({"results": ranked_results})
@app.route("/", methods=["GET"])
def health_check():
    """Liveness endpoint: reply with a fixed JSON status body and HTTP 200."""
    status_payload = {"status": "Server is running"}
    return jsonify(status_payload), 200
if __name__ == "__main__":
    # Start Flask's built-in server only when this file is run as a script.
    # Binds to all interfaces on 7860, the default port for Hugging Face Spaces.
    app.run(port=7860, host="0.0.0.0")
|