tta1301 committed on
Commit 60cfa0c · verified · 1 Parent(s): 6020a57
.env ADDED
@@ -0,0 +1,3 @@
+ PINECONE_API_KEY=pcsk_5a1e4W_7ZNXqosj67HTMf9ggeyfAW9wfxteTWfaYxqgDwhjCjDyuDJ5A37JnFNqatzQEHQ
+ DEEPSEEK_API_KEY=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
+
.idea/.gitignore ADDED
@@ -0,0 +1,3 @@
+ # Default ignored files
+ /shelf/
+ /workspace.xml
.idea/coreRAG_chatbot.iml ADDED
@@ -0,0 +1,8 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <module type="PYTHON_MODULE" version="4">
+   <component name="NewModuleRootManager">
+     <content url="file://$MODULE_DIR$" />
+     <orderEntry type="inheritedJdk" />
+     <orderEntry type="sourceFolder" forTests="false" />
+   </component>
+ </module>
.idea/inspectionProfiles/profiles_settings.xml ADDED
@@ -0,0 +1,6 @@
+ <component name="InspectionProjectProfileManager">
+   <settings>
+     <option name="USE_PROJECT_PROFILE" value="false" />
+     <version value="1.0" />
+   </settings>
+ </component>
.idea/modules.xml ADDED
@@ -0,0 +1,8 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="ProjectModuleManager">
+     <modules>
+       <module fileurl="file://$PROJECT_DIR$/.idea/coreRAG_chatbot.iml" filepath="$PROJECT_DIR$/.idea/coreRAG_chatbot.iml" />
+     </modules>
+   </component>
+ </project>
.idea/vcs.xml ADDED
@@ -0,0 +1,6 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="VcsDirectoryMappings">
+     <mapping directory="$PROJECT_DIR$/.." vcs="Git" />
+   </component>
+ </project>
.idea/workspace.xml ADDED
@@ -0,0 +1,54 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="AutoImportSettings">
+     <option name="autoReloadType" value="SELECTIVE" />
+   </component>
+   <component name="ChangeListManager">
+     <list default="true" id="3da4a5a9-1ccb-4398-b85d-b48b0ea6f952" name="Changes" comment="">
+       <change beforePath="$PROJECT_DIR$/../BE/SpringBoot_Medical_News/src/main/java/com/theanh1301/SpringBoot_Medical_News/repository/UserRepository.java" beforeDir="false" afterPath="$PROJECT_DIR$/../BE/SpringBoot_Medical_News/src/main/java/com/theanh1301/SpringBoot_Medical_News/repository/UserRepository.java" afterDir="false" />
+       <change beforePath="$PROJECT_DIR$/research/trials.ipynb" beforeDir="false" afterPath="$PROJECT_DIR$/research/trials.ipynb" afterDir="false" />
+     </list>
+     <option name="SHOW_DIALOG" value="false" />
+     <option name="HIGHLIGHT_CONFLICTS" value="true" />
+     <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
+     <option name="LAST_RESOLUTION" value="IGNORE" />
+   </component>
+   <component name="Git.Settings">
+     <option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$/.." />
+   </component>
+   <component name="ProjectColorInfo">{
+   &quot;associatedIndex&quot;: 4
+ }</component>
+   <component name="ProjectId" id="30B97tAgogsByivUWQZ4PQ1jA8y" />
+   <component name="ProjectViewState">
+     <option name="hideEmptyMiddlePackages" value="true" />
+     <option name="showLibraryContents" value="true" />
+   </component>
+   <component name="PropertiesComponent"><![CDATA[{
+   "keyToString": {
+     "ModuleVcsDetector.initialDetectionPerformed": "true",
+     "RunOnceActivity.ShowReadmeOnStart": "true",
+     "RunOnceActivity.git.unshallow": "true",
+     "git-widget-placeholder": "crawl__selenium",
+     "ignore.virus.scanning.warn.message": "true",
+     "last_opened_file_path": "E:/SpringBoot/DoAnNganh/Medical_News/coreRAG_chatbot"
+   }
+ }]]></component>
+   <component name="SharedIndexes">
+     <attachedChunks>
+       <set>
+         <option value="bundled-python-sdk-890ed5b35930-d9c5bdb153f4-com.jetbrains.pycharm.pro.sharedIndexes.bundled-PY-251.23774.444" />
+       </set>
+     </attachedChunks>
+   </component>
+   <component name="TaskManager">
+     <task active="true" id="Default" summary="Default task">
+       <changelist id="3da4a5a9-1ccb-4398-b85d-b48b0ea6f952" name="Changes" comment="" />
+       <created>1753089934892</created>
+       <option name="number" value="Default" />
+       <option name="presentableId" value="Default" />
+       <updated>1753089934892</updated>
+     </task>
+     <servers />
+   </component>
+ </project>
ChatbotMedical.egg-info/PKG-INFO ADDED
@@ -0,0 +1,9 @@
+ Metadata-Version: 2.4
+ Name: ChatbotMedical
+ Version: 0.1
+ Summary: A chatbot for medical news
+ Author: Theanh13012004
+ Author-email: [email protected]
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: summary
ChatbotMedical.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,8 @@
+ setup.py
+ ChatbotMedical.egg-info/PKG-INFO
+ ChatbotMedical.egg-info/SOURCES.txt
+ ChatbotMedical.egg-info/dependency_links.txt
+ ChatbotMedical.egg-info/top_level.txt
+ src/__init__.py
+ src/helper.py
+ src/prompt.py
ChatbotMedical.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
+
ChatbotMedical.egg-info/top_level.txt ADDED
@@ -0,0 +1 @@
+ src
DataChatbot/Data_baiviet_benh_vinmec_2025-07-22.txt ADDED
The diff for this file is too large to render. See raw diff
 
__pycache__/configs.cpython-310.pyc ADDED
Binary file (453 Bytes).

__pycache__/configs.cpython-313.pyc ADDED
Binary file (791 Bytes).

__pycache__/store_index.cpython-310.pyc ADDED
Binary file (1.6 kB).

__pycache__/store_index.cpython-313.pyc ADDED
Binary file (2.27 kB).
 
app.py ADDED
@@ -0,0 +1,99 @@
+ from flask import Flask, jsonify, request
+ from flask_apscheduler import APScheduler
+ from src.helper import download_hugging_face_embeddings
+ from langchain_pinecone import PineconeVectorStore
+ from langchain_openai import ChatOpenAI
+ from langchain.chains import create_retrieval_chain
+ from langchain.chains.combine_documents import create_stuff_documents_chain
+ from configs import *
+ from langchain_core.prompts import ChatPromptTemplate
+ from src.prompt import *
+ from store_index import *
+ from sentence_transformers import CrossEncoder
+
+
+ app = Flask(__name__)
+ scheduler = APScheduler()
+ scheduler.init_app(app)
+
+
+ embeddings = download_hugging_face_embeddings()
+
+
+ docsearch = PineconeVectorStore.from_existing_index(
+     index_name=INDEX_NAME,  # if the index has already been created, this can be replaced with "chatbot"
+     embedding=embeddings
+ )
+
+ # as_retriever turns docsearch into a retriever; similarity search (cosine) returns the k=40 nearest chunks
+ retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k": 40})
+
+ cross_encoder = CrossEncoder(MODEL_CROSS_ENCODER_NAME)
+
+
+ llm = ChatOpenAI(
+     model=MODEL_LLM_NAME,
+     openai_api_key=DEEPSEEK_API_KEY,
+     openai_api_base="https://openrouter.ai/api/v1",
+     temperature=0.4,
+     max_tokens=2048
+ )  # temperature 0.4, at most 2048 output tokens
+
+ prompt = ChatPromptTemplate.from_messages(
+     [
+         ("system", system_prompt),
+         ("human", "{input}")  # user input
+     ]
+ )
+
+
+ question_answer_chain = create_stuff_documents_chain(llm, prompt)
+
+ # Rerank the retrieved documents with the cross-encoder and keep the top_n best matches
+ def rerank_documents(query, docs, top_n=5):
+     pairs = [[query, doc.page_content] for doc in docs]
+     scores = cross_encoder.predict(pairs)
+     reranked = sorted(zip(docs, scores), key=lambda x: x[1], reverse=True)
+     return [doc for doc, _ in reranked[:top_n]]
+
+
+ @app.route('/chat_chatbot', methods=['POST'])
+ def chat_chatbot():
+     data = request.json
+     user_input = data.get("msg")
+     # Retrieve the top-k candidates, then rerank them
+     similar_docs = retriever.invoke(user_input)
+     top_docs = rerank_documents(user_input, similar_docs, top_n=5)
+     response = question_answer_chain.invoke({
+         "input": user_input,
+         "context": top_docs
+     })
+
+     return jsonify({"answer": response})
+
+
+ @app.route("/train_new_files", methods=["POST"])
+ def train_api():
+     res = train_new_files()
+     return jsonify({"status": res})
+
+
+ @scheduler.task('interval', id='train_job', hours=6)
+ def scheduled_train():
+     print("Scheduled training started...")
+     train_new_files()
+
+ scheduler.start()
+
+ if __name__ == '__main__':
+     app.run(host="0.0.0.0", port=8080, debug=True)
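For reference, a minimal client sketch for the /chat_chatbot endpoint added above. Only the route, the port, and the "msg" field come from app.py; the base URL, the requests library, and the sample question are assumptions for illustration.

import requests  # hypothetical client, not part of this commit

# app.py binds to 0.0.0.0:8080; adjust the base URL for your deployment
resp = requests.post(
    "http://localhost:8080/chat_chatbot",
    json={"msg": "Triệu chứng của sốt xuất huyết là gì?"},  # any user question
)
print(resp.json()["answer"])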
configs.py ADDED
@@ -0,0 +1,14 @@
+ import os
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+ PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
+ DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY")
+ PINECONE_ENV = os.getenv("PINECONE_ENV", "us-east1-gcp")
+ MODEL_LLM_NAME = "deepseek/deepseek-r1-distill-llama-70b:free"
+ MODEL_EMBEDING_NAME = "dangvantuan/vietnamese-embedding"
+ MODEL_CROSS_ENCODER_NAME = "itdainb/PhoRanker"
+ INDEX_NAME = "medical-chatbot"
+ DATA_FOLDER = "./DataChatbot/"
+ TRAINED_LOG = "trained_files.log"  # names of files that have already been trained are recorded here
requirements.txt ADDED
@@ -0,0 +1,15 @@
+ sentence-transformers
+ pyvi
+ langchain
+ flask
+ flask-apscheduler
+ pypdf
+ python-dotenv
+ pinecone[grpc]
+ langchain-pinecone
+ langchain_community
+ langchain_openai
+ langchain_experimental
+ langchain-huggingface
+
+ -e .
research/trials.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
setup.py ADDED
@@ -0,0 +1,14 @@
+ from setuptools import setup, find_packages
+
+
+ setup(
+     name='ChatbotMedical',
+     version='0.1',
+     author='Theanh13012004',
+     author_email='[email protected]',
+     description='A chatbot for medical news',
+     packages=find_packages(),
+     install_requires=[],
+ )
src/__init__.py ADDED
File without changes
src/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (161 Bytes).

src/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (165 Bytes).

src/__pycache__/helper.cpython-310.pyc ADDED
Binary file (1.9 kB).

src/__pycache__/helper.cpython-313.pyc ADDED
Binary file (2.93 kB).

src/__pycache__/prompt.cpython-310.pyc ADDED
Binary file (514 Bytes).

src/__pycache__/prompt.cpython-313.pyc ADDED
Binary file (675 Bytes).
 
src/helper.py ADDED
@@ -0,0 +1,69 @@
+ from langchain_community.document_loaders import DirectoryLoader, UnstructuredWordDocumentLoader, TextLoader  # updated
+ from langchain_huggingface import HuggingFaceEmbeddings  # updated
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ import re
+ import os
+
+
+ # Load data
+ def load_word_files(data):
+     loader = DirectoryLoader(
+         path=data,
+         glob="*.txt",
+         loader_cls=lambda path: TextLoader(path, encoding='utf-8')  # file type to load
+     )
+     documents = loader.load()
+     return documents
+
+
+ # Preprocess data
+ def preprocess_data(text):
+     # Remove URLs
+     text = re.sub(r'(https?://\S+|www\.\S+)', '', text)
+     # Remove lines that contain only = or - characters
+     text = re.sub(r'^[=\-]{2,}\s*$', '', text, flags=re.MULTILINE)
+     # Remove markdown table characters (|, ---)
+     text = re.sub(r'\|.*?\|', '', text)
+     # Remove emoji and other unnecessary Unicode characters
+     text = re.sub(r'[^\w\s,.!?à-ỹÀ-Ỹ\-–]', '', text)
+     # Collapse extra whitespace and blank lines
+     text = re.sub(r'\s+', ' ', text).strip()
+     return text
+
+
+ # Split into chunks
+ def text_split(cleaned_data):
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=20)
+     text_chunks = text_splitter.split_documents(cleaned_data)
+     return text_chunks
+
+
+ def download_hugging_face_embeddings():
+     embeddings = HuggingFaceEmbeddings(
+         model_name="dangvantuan/vietnamese-embedding"
+     )
+     return embeddings
+
+
+ # n8n
+
+ # Check whether a file has already been trained
+ def is_file_trained(file_name, trained_files_log):
+     if not os.path.exists(trained_files_log):  # the log file does not exist yet
+         return False
+     with open(trained_files_log, 'r', encoding='utf-8') as f:
+         trained_files = f.read().splitlines()  # list of file names that have been trained
+     return file_name in trained_files  # check whether file_name is in that list
+
+
+ # Mark a file as trained
+ def mark_file_trained(file_name, trained_files_log):
+     with open(trained_files_log, 'a', encoding='utf-8') as f:  # open the log in append mode
+         f.write(f"{file_name}\n")  # record the file name once training is done
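For reference, a short sketch of how the helpers above fit together: preprocess_data cleans raw article text and text_split chunks the cleaned documents. The sample string and file name are made up for illustration; the function names and chunking parameters come from src/helper.py.

from langchain.schema import Document
from src.helper import preprocess_data, text_split

# hypothetical raw article snippet with a URL and table noise
raw = "Xem thêm tại https://www.vinmec.com/benh | cột | ====\nSốt xuất huyết là bệnh truyền nhiễm."
doc = Document(page_content=preprocess_data(raw), metadata={"source": "example.txt"})
chunks = text_split([doc])  # RecursiveCharacterTextSplitter, chunk_size=512, overlap=20
print(len(chunks), chunks[0].page_content)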
src/prompt.py ADDED
@@ -0,0 +1,9 @@
+ system_prompt = (
+     "Bạn là một trợ lý để trả lời các câu hỏi y tế. "
+     "Hãy sử dụng các đoạn ngữ cảnh được truy xuất sau đây để trả lời câu hỏi. "
+     "Nếu bạn không biết câu trả lời, hãy nói rằng bạn không biết. "
+     "Dựa trên tất cả tài liệu liên quan (có thể đến từ nhiều nguồn), hãy tổng hợp thông tin một cách thống nhất. "  # avoids inconsistent documents (one saying a disease is contagious, another saying it is not)
+     "Trả lời tối đa ba câu và giữ cho câu trả lời ngắn gọn."
+     "\n\n"
+     "{context}"
+ )
store_index.py ADDED
@@ -0,0 +1,79 @@
+ from src.helper import *
+ from pinecone.grpc import PineconeGRPC as Pinecone
+ from langchain.schema import Document
+ from pinecone import ServerlessSpec
+ from langchain_pinecone import PineconeVectorStore
+ from dotenv import load_dotenv
+ import os
+ from configs import *
+
+
+ # Load the Vietnamese embedding model
+ embeddings = download_hugging_face_embeddings()
+
+ pc = Pinecone(api_key=PINECONE_API_KEY)
+
+
+ def train_new_files():
+     # Load the raw documents
+     all_docs = load_word_files(data=DATA_FOLDER)
+     new_docs = []  # documents that still need to be indexed
+
+     # Preprocess the data
+     for doc in all_docs:
+         file_name = doc.metadata.get("source", "unknown.docx")  # if the metadata has no source, fall back to this placeholder name set by LangChain
+         if not is_file_trained(file_name, TRAINED_LOG):  # not trained yet
+             print(f"New file detected, training: {file_name}")
+             cleaned_content = preprocess_data(doc.page_content)  # preprocess the text
+             cleaned_doc = Document(
+                 page_content=cleaned_content,
+                 metadata=doc.metadata
+             )
+             new_docs.append(cleaned_doc)
+             # mark the file as trained
+             mark_file_trained(file_name, TRAINED_LOG)
+         else:
+             print(f"File was already trained: {file_name}")
+     if not new_docs:
+         return "No new files to train"
+
+     # Create chunks
+     text_chunks = text_split(new_docs)
+     # Create the index if it does not exist yet
+     if INDEX_NAME not in pc.list_indexes().names():
+         pc.create_index(
+             name=INDEX_NAME,
+             dimension=768,
+             metric="cosine",
+             spec=ServerlessSpec(cloud="aws", region="us-east-1")
+         )
+     PineconeVectorStore.from_documents(
+         documents=text_chunks,
+         index_name=INDEX_NAME,
+         embedding=embeddings
+     )
+     return f"Trained {len(new_docs)} new file(s)"  # status message returned by the /train_new_files endpoint
+
+
+ # Run store_index.py once, the first time, to create the index
+ if __name__ == "__main__":
+     result = train_new_files()
+     print(result)
template.py ADDED
@@ -0,0 +1,36 @@
+ import os
+ from pathlib import Path  # path handling that works on every OS
+ import logging
+
+
+ logging.basicConfig(level=logging.INFO, format='[%(asctime)s]: %(message)s:')
+
+ list_dir = [
+     "src/__init__.py",
+     "src/helper.py",
+     "src/prompt.py",
+     ".env",
+     "setup.py",
+     "store_index.py",
+     "app.py",
+     "research/trials.ipynb",
+ ]
+
+
+ for filepath in list_dir:
+     filepath = Path(filepath)  # Path normalizes the path for Windows, Linux, macOS, ...
+     filedir, filename = os.path.split(filepath)
+
+     if filedir != "":
+         os.makedirs(filedir, exist_ok=True)
+         logging.info(f"Creating directory: {filedir} for file: {filename}")
+
+     if (not os.path.exists(filepath)) or (os.path.getsize(filepath) == 0):
+         with open(filepath, "w") as f:
+             pass
+         logging.info(f"Creating file {filename}")
+     else:
+         logging.info(f"{filename} already exists")
trained_files.log ADDED
@@ -0,0 +1 @@
+ DataChatbot\Data_baiviet_benh_vinmec_2025-07-22.txt