tta1301 committed on
Commit 60cfa0c · verified · 1 Parent(s): 6020a57
.env ADDED
@@ -0,0 +1,3 @@
+ PINECONE_API_KEY=pcsk_5a1e4W_7ZNXqosj67HTMf9ggeyfAW9wfxteTWfaYxqgDwhjCjDyuDJ5A37JnFNqatzQEHQ
+ DEEPSEEK_API_KEY=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
+
.idea/.gitignore ADDED
@@ -0,0 +1,3 @@
+ # Default ignored files
+ /shelf/
+ /workspace.xml
.idea/coreRAG_chatbot.iml ADDED
@@ -0,0 +1,8 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <module type="PYTHON_MODULE" version="4">
+   <component name="NewModuleRootManager">
+     <content url="file://$MODULE_DIR$" />
+     <orderEntry type="inheritedJdk" />
+     <orderEntry type="sourceFolder" forTests="false" />
+   </component>
+ </module>
.idea/inspectionProfiles/profiles_settings.xml ADDED
@@ -0,0 +1,6 @@
+ <component name="InspectionProjectProfileManager">
+   <settings>
+     <option name="USE_PROJECT_PROFILE" value="false" />
+     <version value="1.0" />
+   </settings>
+ </component>
.idea/modules.xml ADDED
@@ -0,0 +1,8 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="ProjectModuleManager">
+     <modules>
+       <module fileurl="file://$PROJECT_DIR$/.idea/coreRAG_chatbot.iml" filepath="$PROJECT_DIR$/.idea/coreRAG_chatbot.iml" />
+     </modules>
+   </component>
+ </project>
.idea/vcs.xml ADDED
@@ -0,0 +1,6 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="VcsDirectoryMappings">
+     <mapping directory="$PROJECT_DIR$/.." vcs="Git" />
+   </component>
+ </project>
.idea/workspace.xml ADDED
@@ -0,0 +1,54 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="AutoImportSettings">
+     <option name="autoReloadType" value="SELECTIVE" />
+   </component>
+   <component name="ChangeListManager">
+     <list default="true" id="3da4a5a9-1ccb-4398-b85d-b48b0ea6f952" name="Changes" comment="">
+       <change beforePath="$PROJECT_DIR$/../BE/SpringBoot_Medical_News/src/main/java/com/theanh1301/SpringBoot_Medical_News/repository/UserRepository.java" beforeDir="false" afterPath="$PROJECT_DIR$/../BE/SpringBoot_Medical_News/src/main/java/com/theanh1301/SpringBoot_Medical_News/repository/UserRepository.java" afterDir="false" />
+       <change beforePath="$PROJECT_DIR$/research/trials.ipynb" beforeDir="false" afterPath="$PROJECT_DIR$/research/trials.ipynb" afterDir="false" />
+     </list>
+     <option name="SHOW_DIALOG" value="false" />
+     <option name="HIGHLIGHT_CONFLICTS" value="true" />
+     <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
+     <option name="LAST_RESOLUTION" value="IGNORE" />
+   </component>
+   <component name="Git.Settings">
+     <option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$/.." />
+   </component>
+   <component name="ProjectColorInfo">{
+   &quot;associatedIndex&quot;: 4
+ }</component>
+   <component name="ProjectId" id="30B97tAgogsByivUWQZ4PQ1jA8y" />
+   <component name="ProjectViewState">
+     <option name="hideEmptyMiddlePackages" value="true" />
+     <option name="showLibraryContents" value="true" />
+   </component>
+   <component name="PropertiesComponent"><![CDATA[{
+   "keyToString": {
+     "ModuleVcsDetector.initialDetectionPerformed": "true",
+     "RunOnceActivity.ShowReadmeOnStart": "true",
+     "RunOnceActivity.git.unshallow": "true",
+     "git-widget-placeholder": "crawl__selenium",
+     "ignore.virus.scanning.warn.message": "true",
+     "last_opened_file_path": "E:/SpringBoot/DoAnNganh/Medical_News/coreRAG_chatbot"
+   }
+ }]]></component>
+   <component name="SharedIndexes">
+     <attachedChunks>
+       <set>
+         <option value="bundled-python-sdk-890ed5b35930-d9c5bdb153f4-com.jetbrains.pycharm.pro.sharedIndexes.bundled-PY-251.23774.444" />
+       </set>
+     </attachedChunks>
+   </component>
+   <component name="TaskManager">
+     <task active="true" id="Default" summary="Default task">
+       <changelist id="3da4a5a9-1ccb-4398-b85d-b48b0ea6f952" name="Changes" comment="" />
+       <created>1753089934892</created>
+       <option name="number" value="Default" />
+       <option name="presentableId" value="Default" />
+       <updated>1753089934892</updated>
+     </task>
+     <servers />
+   </component>
+ </project>
ChatbotMedical.egg-info/PKG-INFO ADDED
@@ -0,0 +1,9 @@
+ Metadata-Version: 2.4
+ Name: ChatbotMedical
+ Version: 0.1
+ Summary: A chatbot for medical news
+ Author: Theanh13012004
+ Author-email: [email protected]
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: summary
ChatbotMedical.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,8 @@
+ setup.py
+ ChatbotMedical.egg-info/PKG-INFO
+ ChatbotMedical.egg-info/SOURCES.txt
+ ChatbotMedical.egg-info/dependency_links.txt
+ ChatbotMedical.egg-info/top_level.txt
+ src/__init__.py
+ src/helper.py
+ src/prompt.py
ChatbotMedical.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
+
ChatbotMedical.egg-info/top_level.txt ADDED
@@ -0,0 +1 @@
+ src
DataChatbot/Data_baiviet_benh_vinmec_2025-07-22.txt ADDED
The diff for this file is too large to render. See raw diff
 
__pycache__/configs.cpython-310.pyc ADDED
Binary file (453 Bytes).

__pycache__/configs.cpython-313.pyc ADDED
Binary file (791 Bytes).

__pycache__/store_index.cpython-310.pyc ADDED
Binary file (1.6 kB).

__pycache__/store_index.cpython-313.pyc ADDED
Binary file (2.27 kB).
 
app.py ADDED
@@ -0,0 +1,99 @@
+ from flask import Flask, jsonify, request
+ from flask_apscheduler import APScheduler
+ from src.helper import download_hugging_face_embeddings
+ from langchain_pinecone import PineconeVectorStore
+ from langchain_openai import ChatOpenAI
+ from langchain.chains import create_retrieval_chain
+ from langchain.chains.combine_documents import create_stuff_documents_chain
+ from configs import *
+ from langchain_core.prompts import ChatPromptTemplate
+ from src.prompt import *
+ from store_index import *
+ from sentence_transformers import CrossEncoder
+
+
+ app = Flask(__name__)
+ scheduler = APScheduler()
+ scheduler.init_app(app)
+
+
+ embeddings = download_hugging_face_embeddings()
+
+
+ docsearch = PineconeVectorStore.from_existing_index(
+     index_name=INDEX_NAME,  # if the index has already been created, this can be replaced with "chatbot"
+     embedding=embeddings
+ )
+
+ # as_retriever turns docsearch into a retriever; similarity search (cosine) returns the k=40 nearest chunks
+ retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k": 40})
+
+ cross_encoder = CrossEncoder(MODEL_CROSS_ENCODER_NAME)
+
+
+ llm = ChatOpenAI(
+     model=MODEL_LLM_NAME,
+     openai_api_key=DEEPSEEK_API_KEY,
+     openai_api_base="https://openrouter.ai/api/v1",
+     temperature=0.4,
+     max_tokens=2048
+ )  # temperature 0.4, at most 2048 output tokens
+
+ prompt = ChatPromptTemplate.from_messages(
+     [
+         ("system", system_prompt),
+         ("human", "{input}")  # user input
+     ]
+ )
+
+
+ question_answer_chain = create_stuff_documents_chain(llm, prompt)
+
+ # Rerank the retrieved documents with the cross-encoder and keep the top_n best matches
+ def rerank_documents(query, docs, top_n=5):
+     pairs = [[query, doc.page_content] for doc in docs]
+     scores = cross_encoder.predict(pairs)
+     reranked = sorted(zip(docs, scores), key=lambda x: x[1], reverse=True)
+     return [doc for doc, _ in reranked[:top_n]]
+
+
+ @app.route('/chat_chatbot', methods=['POST'])
+ def chat_chatbot():
+     data = request.json
+     user_input = data.get("msg")
+     # Retrieve the top-k candidates, then rerank them
+     similar_docs = retriever.invoke(user_input)
+     top_docs = rerank_documents(user_input, similar_docs, top_n=5)
+     response = question_answer_chain.invoke({
+         "input": user_input,
+         "context": top_docs
+     })
+
+     return jsonify({"answer": response})
+
+
+ @app.route("/train_new_files", methods=["POST"])
+ def train_api():
+     res = train_new_files()
+     return jsonify({"status": res})
+
+
+ @scheduler.task('interval', id='train_job', hours=6)
+ def scheduled_train():
+     print("Scheduled training started...")
+     train_new_files()
+
+ scheduler.start()
+
+ if __name__ == '__main__':
+     app.run(host="0.0.0.0", port=8080, debug=True)
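For reference, a minimal client sketch for the /chat_chatbot endpoint added above. Only the route, the port, and the "msg" field come from app.py; the base URL, the requests library, and the sample question are assumptions for illustration.

import requests  # hypothetical client, not part of this commit

# app.py binds to 0.0.0.0:8080; adjust the base URL for your deployment
resp = requests.post(
    "http://localhost:8080/chat_chatbot",
    json={"msg": "Triệu chứng của sốt xuất huyết là gì?"},  # any user question
)
print(resp.json()["answer"])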
configs.py ADDED
@@ -0,0 +1,14 @@
+ import os
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+ PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
+ DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY")
+ PINECONE_ENV = os.getenv("PINECONE_ENV", "us-east1-gcp")
+ MODEL_LLM_NAME = "deepseek/deepseek-r1-distill-llama-70b:free"
+ MODEL_EMBEDING_NAME = "dangvantuan/vietnamese-embedding"
+ MODEL_CROSS_ENCODER_NAME = "itdainb/PhoRanker"
+ INDEX_NAME = "medical-chatbot"
+ DATA_FOLDER = "./DataChatbot/"
+ TRAINED_LOG = "trained_files.log"  # names of files that have already been trained are recorded here
requirements.txt ADDED
@@ -0,0 +1,15 @@
+ sentence-transformers
+ pyvi
+ langchain
+ flask
+ flask-apscheduler
+ pypdf
+ python-dotenv
+ pinecone[grpc]
+ langchain-pinecone
+ langchain_community
+ langchain_openai
+ langchain_experimental
+ langchain-huggingface
+
+ -e .
research/trials.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
setup.py ADDED
@@ -0,0 +1,14 @@
+ from setuptools import setup, find_packages
+
+
+ setup(
+     name='ChatbotMedical',
+     version='0.1',
+     author='Theanh13012004',
+     author_email='[email protected]',
+     description='A chatbot for medical news',
+     packages=find_packages(),
+     install_requires=[],
+ )
src/__init__.py ADDED
File without changes
src/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (161 Bytes).

src/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (165 Bytes).

src/__pycache__/helper.cpython-310.pyc ADDED
Binary file (1.9 kB).

src/__pycache__/helper.cpython-313.pyc ADDED
Binary file (2.93 kB).

src/__pycache__/prompt.cpython-310.pyc ADDED
Binary file (514 Bytes).

src/__pycache__/prompt.cpython-313.pyc ADDED
Binary file (675 Bytes).
 
src/helper.py ADDED
@@ -0,0 +1,69 @@
+ from langchain_community.document_loaders import DirectoryLoader, UnstructuredWordDocumentLoader, TextLoader  # updated
+ from langchain_huggingface import HuggingFaceEmbeddings  # updated
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ import re
+ import os
+
+
+ # Load data
+ def load_word_files(data):
+     loader = DirectoryLoader(
+         path=data,
+         glob="*.txt",
+         loader_cls=lambda path: TextLoader(path, encoding='utf-8')  # file type to load
+     )
+     documents = loader.load()
+     return documents
+
+
+ # Preprocess data
+ def preprocess_data(text):
+     # Remove URLs
+     text = re.sub(r'(https?://\S+|www\.\S+)', '', text)
+     # Remove lines that contain only = or - characters
+     text = re.sub(r'^[=\-]{2,}\s*$', '', text, flags=re.MULTILINE)
+     # Remove markdown table characters (|, ---)
+     text = re.sub(r'\|.*?\|', '', text)
+     # Remove emoji and other unnecessary Unicode characters
+     text = re.sub(r'[^\w\s,.!?à-ỹÀ-Ỹ\-–]', '', text)
+     # Collapse extra whitespace and blank lines
+     text = re.sub(r'\s+', ' ', text).strip()
+     return text
+
+
+ # Split into chunks
+ def text_split(cleaned_data):
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=20)
+     text_chunks = text_splitter.split_documents(cleaned_data)
+     return text_chunks
+
+
+ def download_hugging_face_embeddings():
+     embeddings = HuggingFaceEmbeddings(
+         model_name="dangvantuan/vietnamese-embedding"
+     )
+     return embeddings
+
+
+ # n8n
+
+ # Check whether a file has already been trained
+ def is_file_trained(file_name, trained_files_log):
+     if not os.path.exists(trained_files_log):  # the log file does not exist yet
+         return False
+     with open(trained_files_log, 'r', encoding='utf-8') as f:
+         trained_files = f.read().splitlines()  # list of file names that have been trained
+     return file_name in trained_files  # check whether file_name is in that list
+
+
+ # Mark a file as trained
+ def mark_file_trained(file_name, trained_files_log):
+     with open(trained_files_log, 'a', encoding='utf-8') as f:  # open the log in append mode
+         f.write(f"{file_name}\n")  # record the file name once training is done
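For reference, a short sketch of how the helpers above fit together: preprocess_data cleans raw article text and text_split chunks the cleaned documents. The sample string and file name are made up for illustration; the function names and chunking parameters come from src/helper.py.

from langchain.schema import Document
from src.helper import preprocess_data, text_split

# hypothetical raw article snippet with a URL and table noise
raw = "Xem thêm tại https://www.vinmec.com/benh | cột | ====\nSốt xuất huyết là bệnh truyền nhiễm."
doc = Document(page_content=preprocess_data(raw), metadata={"source": "example.txt"})
chunks = text_split([doc])  # RecursiveCharacterTextSplitter, chunk_size=512, overlap=20
print(len(chunks), chunks[0].page_content)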
src/prompt.py ADDED
@@ -0,0 +1,9 @@
+ system_prompt = (
+     "Bạn là một trợ lý để trả lời các câu hỏi y tế. "
+     "Hãy sử dụng các đoạn ngữ cảnh được truy xuất sau đây để trả lời câu hỏi. "
+     "Nếu bạn không biết câu trả lời, hãy nói rằng bạn không biết. "
+     "Dựa trên tất cả tài liệu liên quan (có thể đến từ nhiều nguồn), hãy tổng hợp thông tin một cách thống nhất. "  # avoids inconsistent documents (one saying a disease is contagious, another saying it is not)
+     "Trả lời tối đa ba câu và giữ cho câu trả lời ngắn gọn."
+     "\n\n"
+     "{context}"
+ )
store_index.py ADDED
@@ -0,0 +1,79 @@
+ from src.helper import *
+ from pinecone.grpc import PineconeGRPC as Pinecone
+ from langchain.schema import Document
+ from pinecone import ServerlessSpec
+ from langchain_pinecone import PineconeVectorStore
+ from dotenv import load_dotenv
+ import os
+ from configs import *
+
+
+ # Load the Vietnamese embedding model
+ embeddings = download_hugging_face_embeddings()
+
+ pc = Pinecone(api_key=PINECONE_API_KEY)
+
+
+ def train_new_files():
+     # Load the raw documents
+     all_docs = load_word_files(data=DATA_FOLDER)
+     new_docs = []  # documents that still need to be indexed
+
+     # Preprocess the data
+     for doc in all_docs:
+         file_name = doc.metadata.get("source", "unknown.docx")  # if the metadata has no source, fall back to this placeholder name set by LangChain
+         if not is_file_trained(file_name, TRAINED_LOG):  # not trained yet
+             print(f"New file detected, training: {file_name}")
+             cleaned_content = preprocess_data(doc.page_content)  # preprocess the text
+             cleaned_doc = Document(
+                 page_content=cleaned_content,
+                 metadata=doc.metadata
+             )
+             new_docs.append(cleaned_doc)
+             # mark the file as trained
+             mark_file_trained(file_name, TRAINED_LOG)
+         else:
+             print(f"File was already trained: {file_name}")
+     if not new_docs:
+         return "No new files to train"
+
+     # Create chunks
+     text_chunks = text_split(new_docs)
+     # Create the index if it does not exist yet
+     if INDEX_NAME not in pc.list_indexes().names():
+         pc.create_index(
+             name=INDEX_NAME,
+             dimension=768,
+             metric="cosine",
+             spec=ServerlessSpec(cloud="aws", region="us-east-1")
+         )
+     PineconeVectorStore.from_documents(
+         documents=text_chunks,
+         index_name=INDEX_NAME,
+         embedding=embeddings
+     )
+     return f"Trained {len(new_docs)} new file(s)"  # status message returned by the /train_new_files endpoint
+
+
+ # Run store_index.py once, the first time, to create the index
+ if __name__ == "__main__":
+     result = train_new_files()
+     print(result)
template.py ADDED
@@ -0,0 +1,36 @@
+ import os
+ from pathlib import Path  # path handling that works on every OS
+ import logging
+
+
+ logging.basicConfig(level=logging.INFO, format='[%(asctime)s]: %(message)s:')
+
+ list_dir = [
+     "src/__init__.py",
+     "src/helper.py",
+     "src/prompt.py",
+     ".env",
+     "setup.py",
+     "store_index.py",
+     "app.py",
+     "research/trials.ipynb",
+ ]
+
+
+ for filepath in list_dir:
+     filepath = Path(filepath)  # Path normalizes the path for Windows, Linux, macOS, ...
+     filedir, filename = os.path.split(filepath)
+
+     if filedir != "":
+         os.makedirs(filedir, exist_ok=True)
+         logging.info(f"Creating directory: {filedir} for file: {filename}")
+
+     if (not os.path.exists(filepath)) or (os.path.getsize(filepath) == 0):
+         with open(filepath, "w") as f:
+             pass
+         logging.info(f"Creating file {filename}")
+     else:
+         logging.info(f"{filename} already exists")
trained_files.log ADDED
@@ -0,0 +1 @@
+ DataChatbot\Data_baiviet_benh_vinmec_2025-07-22.txt