Spaces:

deddoggo
/

chatbot_demo

Running on T4

App Files Community

deddoggo committed 2 days ago

Commit

0c16fc9

1 Parent(s): b8e326d

update

Browse files

Files changed (4) hide show

app.py +53 -233
data_processor.py +86 -0
llm_handler.py → rag_pipeline.py +0 -0
retrieval_handler.py → retriever.py +0 -0

app.py CHANGED Viewed

@@ -1,238 +1,58 @@
-# app.py
-# Phiên bản triển khai cuối cùng, tải các mô hình từ Hub và các tài sản dữ liệu đã xử lý trước.
-import os
-import sys
-import json
-import re
-import pickle
-from collections import defaultdict
-# Core ML/DL và Unsloth
-import torch
-from unsloth import FastLanguageModel
-from transformers import TextStreamer
-# RAG - Retrieval
-import faiss
-from sentence_transformers import SentenceTransformer
-from rank_bm25 import BM25Okapi
-import numpy as np
-# Deployment
 import gradio as gr
-print("✅ Import thư viện thành công.")
-# --- PHẦN 1: CẤU HÌNH VÀ ĐƯỜNG DẪN ---
-# Tên các mô hình sẽ được tải từ Hub
-EMBEDDING_MODEL_NAME = "bkai-foundation-models/vietnamese-bi-encoder"
-LLM_MODEL_NAME = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit"
-# Đường dẫn đến các file bạn sẽ upload lên Space
-# Tạo một thư mục 'data' trong Space để chứa chúng cho gọn gàng
-RAW_LAW_DATA_FILE = "data/luat_chi_tiet_output_openai_sdk_final_cleaned.json"
-FAISS_INDEX_FILE = "data/my_law_faiss_flatip_normalized.index"
-# Tên các file sẽ được tạo ra trong quá trình chạy (nếu chưa có)
-PROCESSED_CHUNKS_FILE = "processed_chunks.json"
-BM25_MODEL_FILE = "bm25_model.pkl"
-# Biến toàn cục để lưu trữ tài nguyên
-APP_RESOURCES = {}
-# --- PHẦN 2: CÁC HÀM TIỆN ÍCH VÀ XỬ LÝ DỮ LIỆU ---
-def process_law_data_to_chunks(structured_data):
-    """Làm phẳng dữ liệu luật có cấu trúc thành danh sách các chunks."""
-    flat_list = []
-    articles = [structured_data] if isinstance(structured_data, dict) else structured_data
-    for article_data in articles:
-        if not isinstance(article_data, dict): continue
-        clauses = article_data.get("clauses", [])
-        for clause in clauses:
-            points = clause.get("points_in_clause", [])
-            if points:
-                for point in points:
-                    text = point.get("point_text_original")
-                    if text:
-                        flat_list.append({"text": text, "metadata": {"article": article_data.get("article"), "clause": clause.get("clause_number"), "point": point.get("point_id")}})
-            else:
-                text = clause.get("clause_text_original")
-                if text:
-                    flat_list.append({"text": text, "metadata": {"article": article_data.get("article"), "clause": clause.get("clause_number")}})
-    return flat_list
-def tokenize_vi_simple(text):
-    """Tokenize tiếng Việt đơn giản."""
-    text = text.lower()
-    text = re.sub(r'[^\w\s]', '', text)
-    return text.split()
-# --- PHẦN 3: LOGIC CỐT LÕI CỦA ỨNG DỤNG ---
-def load_app_resources():
-    """Tải hoặc tạo tất cả các tài nguyên cần thiết cho ứng dụng."""
-    print("--- Bắt đầu quá trình tải tài nguyên ---")
-    # 1. Tải các mô hình AI từ Hub
-    print("1. Đang tải LLM và Embedding Model từ Hugging Face Hub...")
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    llm_model, tokenizer = FastLanguageModel.from_pretrained(
-        model_name=LLM_MODEL_NAME, max_seq_length=2048, dtype=None, load_in_4bit=True
     )
-    FastLanguageModel.for_inference(llm_model)
-    embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME, device=device)
-    APP_RESOURCES['llm_model'] = llm_model
-    APP_RESOURCES['tokenizer'] = tokenizer
-    APP_RESOURCES['embedding_model'] = embedding_model
-    print("✅ Tải mô hình AI thành công.")
-    # 2. Tải và xử lý dữ liệu luật từ file JSON bạn cung cấp
-    print(f"2. Đang tải và xử lý dữ liệu từ '{RAW_LAW_DATA_FILE}'...")
-    if not os.path.exists(RAW_LAW_DATA_FILE):
-        raise FileNotFoundError(f"Không tìm thấy file dữ liệu luật: '{RAW_LAW_DATA_FILE}'. Vui lòng tạo thư mục 'data' và upload file này lên Space.")
-    with open(RAW_LAW_DATA_FILE, 'r', encoding='utf-8') as f:
-        raw_data = json.load(f)
-    chunks_data = process_law_data_to_chunks(raw_data)
-    APP_RESOURCES['chunks_data'] = chunks_data
-    print(f"✅ Đã xử lý thành {len(chunks_data)} chunks dữ liệu.")
-    # 3. Tải FAISS Index đã được tạo sẵn
-    print(f"3. Đang tải FAISS index từ '{FAISS_INDEX_FILE}'...")
-    if not os.path.exists(FAISS_INDEX_FILE):
-        raise FileNotFoundError(f"Không tìm thấy file FAISS index: '{FAISS_INDEX_FILE}'. Vui lòng upload file này.")
-    faiss_index = faiss.read_index(FAISS_INDEX_FILE)
-    APP_RESOURCES['faiss_index'] = faiss_index
-    print(f"✅ Đã tải FAISS Index với {faiss_index.ntotal} vectors.")
-    # 4. Tải hoặc tạo BM25 Model (bước này nhanh nên có thể tạo on-the-fly)
-    if os.path.exists(BM25_MODEL_FILE):
-        print(f"4. Đang tải BM25 model từ '{BM25_MODEL_FILE}'...")
-        with open(BM25_MODEL_FILE, 'rb') as f:
-            bm25_model = pickle.load(f)
-    else:
-        print("4. Không tìm thấy BM25 model. Đang tạo mới...")
-        tokenized_corpus = [tokenize_vi_simple(c['text']) for c in chunks_data]
-        bm25_model = BM25Okapi(tokenized_corpus)
-        with open(BM25_MODEL_FILE, 'wb') as f:
-            pickle.dump(bm25_model, f)
-        print(f"✅ Đã tạo và lưu BM25 model vào '{BM25_MODEL_FILE}'.")
-    APP_RESOURCES['bm25_model'] = bm25_model
-    print("\n--- Tải tài nguyên hoàn tất! Ứng dụng đã sẵn sàng. ---")
-def search_relevant_laws(query_text, k=5):
-    """Thực hiện Hybrid Search."""
-    # (Hàm này giữ nguyên như trước, không cần thay đổi)
-    rrf_k_constant = 60
-    embedding_model = APP_RESOURCES['embedding_model']
-    faiss_index = APP_RESOURCES['faiss_index']
-    chunks_data = APP_RESOURCES['chunks_data']
-    bm25_model = APP_RESOURCES['bm25_model']
-    query_embedding = embedding_model.encode([query_text], convert_to_tensor=True)
-    query_embedding_np = query_embedding.cpu().numpy().astype('float32')
-    faiss.normalize_L2(query_embedding_np)
-    num_candidates = min(k * 10, faiss_index.ntotal)
-    _, semantic_indices = faiss_index.search(query_embedding_np, num_candidates)
-    tokenized_query = tokenize_vi_simple(query_text)
-    bm25_scores = bm25_model.get_scores(tokenized_query)
-    bm25_results = sorted(enumerate(bm25_scores), key=lambda x: x[1], reverse=True)[:num_candidates]
-    rrf_scores = defaultdict(float)
-    if semantic_indices.size > 0:
-        for rank, doc_idx in enumerate(semantic_indices[0]):
-            if doc_idx != -1: rrf_scores[doc_idx] += 1.0 / (rrf_k_constant + rank)
-    for rank, (doc_idx, score) in enumerate(bm25_results):
-        if score > 0: rrf_scores[doc_idx] += 1.0 / (rrf_k_constant + rank)
-    fused_results = sorted(rrf_scores.items(), key=lambda x: x[1], reverse=True)
-    final_results = []
-    for doc_idx, score in fused_results[:k]:
-        result = chunks_data[doc_idx].copy()
-        result['retrieval_score'] = score
-        final_results.append(result)
-    return final_results
-def generate_llm_response(query, context):
-    """Sinh câu trả lời từ LLM."""
-    # (Hàm này giữ nguyên như trước, không cần thay đổi)
-    llm_model = APP_RESOURCES['llm_model']
-    tokenizer = APP_RESOURCES['tokenizer']
-    prompt = f"""Bạn là một trợ lý AI chuyên tư vấn về luật giao thông đường bộ Việt Nam. Dựa vào các thông tin luật được cung cấp dưới đây để trả lời câu hỏi của người dùng một cách chính xác và chi tiết.
-### Thông tin luật được trích dẫn:
-{context}
-### Câu hỏi của người dùng:
-{query}
-### Trả lời của bạn:"""
-    inputs = tokenizer(prompt, return_tensors="pt").to(llm_model.device)
-    output_ids = llm_model.generate(**inputs, max_new_tokens=512, temperature=0.3, do_sample=True, pad_token_id=tokenizer.eos_token_id)
-    response_text = tokenizer.decode(output_ids[0][inputs.input_ids.shape[1]:], skip_special_tokens=True).strip()
-    return response_text
-# --- PHẦN 4: CÁC HÀM GIAO TIẾP VỚI GRADIO ---
-def retriever_interface(query):
-    """Giao tiếp với Tab 1: Chỉ tìm kiếm."""
-    retrieved_results = search_relevant_laws(query)
-    if not retrieved_results: return "Không tìm thấy điều luật nào liên quan."
-    output_md = "### Các điều luật liên quan nhất:\n\n"
-    for i, res in enumerate(retrieved_results):
-        meta = res.get('metadata', {})
-        text = res.get('text', 'N/A')
-        output_md += f"**{i+1}. {meta.get('article', 'N/A')} | {meta.get('clause', 'N/A')} | {meta.get('point', 'N/A')}**\n"
-        output_md += f"> {text}\n\n---\n\n"
-    return output_md
-def rag_interface(query, progress=gr.Progress()):
-    """Giao tiếp với Tab 2: RAG hoàn chỉnh."""
-    progress(0.2, desc="Đang tìm kiếm ngữ cảnh...")
-    retrieved_results = search_relevant_laws(query)
-    if not retrieved_results:
-        context_for_llm = "Không tìm thấy thông tin luật liên quan."
-        context_for_display = "Không tìm thấy điều luật nào liên quan để tạo câu trả lời."
-    else:
-        context_for_llm = "\n\n---\n\n".join([r['text'] for r in retrieved_results])
-        context_for_display = retriever_interface(query)
-    progress(0.7, desc="Đang sinh câu trả lời...")
-    final_answer = generate_llm_response(query, context_for_llm)
-    progress(1, desc="Hoàn tất!")
-    return final_answer, context_for_display
-# --- PHẦN 5: KHỞI CHẠY ỨNG DỤNG ---
-# Tải tài nguyên một lần duy nhất
-load_app_resources()
-# Xây dựng giao diện
-with gr.Blocks(theme=gr.themes.Soft(), title="Chatbot Luật GTĐB") as demo:
-    gr.Markdown("# ⚖️ Chatbot Luật Giao thông Đường bộ Việt Nam (RAG)")
-    with gr.Tabs():
-        with gr.TabItem("Tìm kiếm Điều luật (Retriever)"):
-            retriever_query = gr.Textbox(label="Nhập nội dung tìm kiếm", placeholder="Vd: Vượt đèn đỏ")
-            retriever_button = gr.Button("Tìm kiếm", variant="secondary")
-            retriever_output = gr.Markdown(label="Các điều luật liên quan")
-        with gr.TabItem("Hỏi-Đáp (RAG)"):
-            rag_query = gr.Textbox(label="Nhập câu hỏi", placeholder="Vd: Vượt đèn đỏ phạt bao nhiêu tiền?")
-            rag_button = gr.Button("Gửi", variant="primary")
-            rag_answer = gr.Textbox(label="Câu trả lời", interactive=False, lines=7)
-            with gr.Accordion("Xem ngữ cảnh đã sử dụng", open=False):
-                rag_context = gr.Markdown()
-    retriever_button.click(fn=retriever_interface, inputs=retriever_query, outputs=retriever_output)
-    rag_button.click(fn=rag_interface, inputs=rag_query, outputs=[rag_answer, rag_context])
 if __name__ == "__main__":
-    demo.launch()

+# file: app.py
 import gradio as gr
+import time
+from rag_pipeline import initialize_components, generate_response
+# --- KHỞI TẠO CÁC THÀNH PHẦN (CHỈ CHẠY 1 LẦN) ---
+start_time = time.time()
+print("Bắt đầu khởi tạo ứng dụng Chatbot Luật Giao thông...")
+DATA_PATH = "data/luat_chi_tiet_output_openai_sdk_final_cleaned.json"
+COMPONENTS = initialize_components(DATA_PATH)
+end_time = time.time()
+print(f"✅ Ứng dụng đã sẵn sàng! Thời gian khởi tạo: {end_time - start_time:.2f} giây.")
+# ----------------------------------------------------
+def chat_interface(query, history):
+    """
+    Hàm xử lý logic cho giao diện chat của Gradio.
+    """
+    print(f"Nhận được câu hỏi từ người dùng: '{query}'")
+    # Gọi hàm generate_response với query và các thành phần đã được khởi tạo
+    response = generate_response(query, COMPONENTS)
+    return response
+# --- GIAO DIỆN GRADIO ---
+with gr.Blocks(theme=gr.themes.Soft(), title="Chatbot Luật Giao thông Việt Nam") as demo:
+    gr.Markdown(
+        """
+        # ⚖️ Chatbot Luật Giao thông Việt Nam
+        Hỏi đáp về các quy định, mức phạt trong luật giao thông đường bộ dựa trên cơ sở dữ liệu được cung cấp.
+        *Lưu ý: Đây là một sản phẩm demo. Thông tin chỉ mang tính chất tham khảo.*
+        """
     )
+    chatbot = gr.Chatbot(label="Chatbot", height=500)
+    msg = gr.Textbox(label="Nhập câu hỏi của bạn", placeholder="Ví dụ: Vượt đèn đỏ bị phạt bao nhiêu tiền?")
+    clear = gr.ClearButton([msg, chatbot])
+    def respond(message, chat_history):
+        bot_message = chat_interface(message, chat_history)
+        chat_history.append((message, bot_message))
+        return "", chat_history
+    msg.submit(respond, [msg, chatbot], [msg, chatbot])
+    gr.Examples(
+        examples=[
+            "Phương tiện giao thông đường bộ gồm những loại nào?",
+            "Vượt đèn đỏ phạt bao nhiêu tiền đối với xe máy?",
+            "Nồng độ cồn cho phép khi lái xe ô tô là bao nhiêu?",
+            "Đi sai làn đường bị trừ mấy điểm bằng lái?",
+        ],
+        inputs=msg
+    )
 if __name__ == "__main__":
+    demo.launch()

data_processor.py ADDED Viewed

	@@ -0,0 +1,86 @@

+# file: data_processor.py
+import json
+def process_law_data_to_chunks(structured_data_input):
+    """
+    Xử lý dữ liệu luật từ cấu trúc JSON lồng nhau thành một danh sách phẳng các chunks.
+    Mỗi chunk chứa text và metadata tương ứng.
+    """
+    flat_list = []
+    # Đảm bảo đầu vào là một danh sách các điều luật (articles)
+    if isinstance(structured_data_input, dict) and "article" in structured_data_input:
+        articles_list = [structured_data_input]
+    elif isinstance(structured_data_input, list):
+        articles_list = structured_data_input
+    else:
+        print("Lỗi: Dữ liệu đầu vào không phải là danh sách các Điều luật hoặc một đối tượng Điều luật.")
+        return flat_list
+    for article_data in articles_list:
+        if not isinstance(article_data, dict):
+            print(f"Cảnh báo: Bỏ qua một mục trong danh sách điều luật vì không phải là dictionary: {article_data}")
+            continue
+        article_metadata_base = {
+            "source_document": article_data.get("source_document"),
+            "article": article_data.get("article"),
+            "article_title": article_data.get("article_title")
+        }
+        clauses = article_data.get("clauses", [])
+        if not isinstance(clauses, list):
+            print(f"Cảnh báo: 'clauses' trong điều {article_metadata_base.get('article')} không phải là danh sách. Bỏ qua.")
+            continue
+        for clause_data in clauses:
+            if not isinstance(clause_data, dict):
+                print(f"Cảnh báo: Bỏ qua một mục trong 'clauses' vì không phải là dictionary: {clause_data}")
+                continue
+            clause_metadata_base = article_metadata_base.copy()
+            clause_metadata_base.update({
+                "clause_number": clause_data.get("clause_number"),
+                "clause_metadata_summary": clause_data.get("clause_metadata_summary")
+            })
+            points_in_clause = clause_data.get("points_in_clause", [])
+            if not isinstance(points_in_clause, list):
+                print(f"Cảnh báo: 'points_in_clause' trong khoản {clause_metadata_base.get('clause_number')} của điều {article_metadata_base.get('article')} không phải là danh sách. Bỏ qua.")
+                continue
+            if points_in_clause:
+                for point_data in points_in_clause:
+                    if not isinstance(point_data, dict):
+                        print(f"Cảnh báo: Bỏ qua một mục trong 'points_in_clause' vì không phải là dictionary: {point_data}")
+                        continue
+                    chunk_text = point_data.get("point_text_original") or point_data.get("violation_description_summary")
+                    if not chunk_text:
+                        continue
+                    current_point_metadata = clause_metadata_base.copy()
+                    point_specific_metadata = point_data.copy()
+                    if "point_text_original" in point_specific_metadata:
+                        del point_specific_metadata["point_text_original"]
+                    current_point_metadata.update(point_specific_metadata)
+                    final_metadata_cleaned = {k: v for k, v in current_point_metadata.items() if v is not None}
+                    flat_list.append({"text": chunk_text, "metadata": final_metadata_cleaned})
+            else:
+                chunk_text = clause_data.get("clause_text_original")
+                if chunk_text:
+                    current_clause_metadata = clause_metadata_base.copy()
+                    additional_clause_info = {}
+                    for key, value in clause_data.items():
+                        if key not in ["clause_text_original", "points_in_clause", "clause_number", "clause_metadata_summary"]:
+                            additional_clause_info[key] = value
+                    if additional_clause_info:
+                        current_clause_metadata.update(additional_clause_info)
+                    final_metadata_cleaned = {k: v for k, v in current_clause_metadata.items() if v is not None}
+                    flat_list.append({"text": chunk_text, "metadata": final_metadata_cleaned})
+    return flat_list

llm_handler.py → rag_pipeline.py RENAMED Viewed

File without changes

retrieval_handler.py → retriever.py RENAMED Viewed

File without changes