Spaces:
Running
Running
Update
Browse files- .env +3 -0
- .idea/.gitignore +3 -0
- .idea/coreRAG_chatbot.iml +8 -0
- .idea/inspectionProfiles/profiles_settings.xml +6 -0
- .idea/modules.xml +8 -0
- .idea/vcs.xml +6 -0
- .idea/workspace.xml +54 -0
- ChatbotMedical.egg-info/PKG-INFO +9 -0
- ChatbotMedical.egg-info/SOURCES.txt +8 -0
- ChatbotMedical.egg-info/dependency_links.txt +1 -0
- ChatbotMedical.egg-info/top_level.txt +1 -0
- DataChatbot/Data_baiviet_benh_vinmec_2025-07-22.txt +0 -0
- __pycache__/configs.cpython-310.pyc +0 -0
- __pycache__/configs.cpython-313.pyc +0 -0
- __pycache__/store_index.cpython-310.pyc +0 -0
- __pycache__/store_index.cpython-313.pyc +0 -0
- app.py +99 -0
- configs.py +14 -0
- requirements.txt +15 -0
- research/trials.ipynb +0 -0
- setup.py +14 -0
- src/__init__.py +0 -0
- src/__pycache__/__init__.cpython-310.pyc +0 -0
- src/__pycache__/__init__.cpython-313.pyc +0 -0
- src/__pycache__/helper.cpython-310.pyc +0 -0
- src/__pycache__/helper.cpython-313.pyc +0 -0
- src/__pycache__/prompt.cpython-310.pyc +0 -0
- src/__pycache__/prompt.cpython-313.pyc +0 -0
- src/helper.py +69 -0
- src/prompt.py +9 -0
- store_index.py +79 -0
- template.py +36 -0
- trained_files.log +1 -0
.env
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
PINECONE_API_KEY=<REDACTED — real key was committed in plain text; rotate it in the Pinecone console and keep .env out of version control>
|
2 |
+
DEEPSEEK_API_KEY=<REDACTED — real OpenRouter key was committed in plain text; rotate it and keep .env out of version control>
|
3 |
+
|
.idea/.gitignore
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
# Default ignored files
|
2 |
+
/shelf/
|
3 |
+
/workspace.xml
|
.idea/coreRAG_chatbot.iml
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<module type="PYTHON_MODULE" version="4">
|
3 |
+
<component name="NewModuleRootManager">
|
4 |
+
<content url="file://$MODULE_DIR$" />
|
5 |
+
<orderEntry type="inheritedJdk" />
|
6 |
+
<orderEntry type="sourceFolder" forTests="false" />
|
7 |
+
</component>
|
8 |
+
</module>
|
.idea/inspectionProfiles/profiles_settings.xml
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<component name="InspectionProjectProfileManager">
|
2 |
+
<settings>
|
3 |
+
<option name="USE_PROJECT_PROFILE" value="false" />
|
4 |
+
<version value="1.0" />
|
5 |
+
</settings>
|
6 |
+
</component>
|
.idea/modules.xml
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<project version="4">
|
3 |
+
<component name="ProjectModuleManager">
|
4 |
+
<modules>
|
5 |
+
<module fileurl="file://$PROJECT_DIR$/.idea/coreRAG_chatbot.iml" filepath="$PROJECT_DIR$/.idea/coreRAG_chatbot.iml" />
|
6 |
+
</modules>
|
7 |
+
</component>
|
8 |
+
</project>
|
.idea/vcs.xml
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<project version="4">
|
3 |
+
<component name="VcsDirectoryMappings">
|
4 |
+
<mapping directory="$PROJECT_DIR$/.." vcs="Git" />
|
5 |
+
</component>
|
6 |
+
</project>
|
.idea/workspace.xml
ADDED
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<project version="4">
|
3 |
+
<component name="AutoImportSettings">
|
4 |
+
<option name="autoReloadType" value="SELECTIVE" />
|
5 |
+
</component>
|
6 |
+
<component name="ChangeListManager">
|
7 |
+
<list default="true" id="3da4a5a9-1ccb-4398-b85d-b48b0ea6f952" name="Changes" comment="">
|
8 |
+
<change beforePath="$PROJECT_DIR$/../BE/SpringBoot_Medical_News/src/main/java/com/theanh1301/SpringBoot_Medical_News/repository/UserRepository.java" beforeDir="false" afterPath="$PROJECT_DIR$/../BE/SpringBoot_Medical_News/src/main/java/com/theanh1301/SpringBoot_Medical_News/repository/UserRepository.java" afterDir="false" />
|
9 |
+
<change beforePath="$PROJECT_DIR$/research/trials.ipynb" beforeDir="false" afterPath="$PROJECT_DIR$/research/trials.ipynb" afterDir="false" />
|
10 |
+
</list>
|
11 |
+
<option name="SHOW_DIALOG" value="false" />
|
12 |
+
<option name="HIGHLIGHT_CONFLICTS" value="true" />
|
13 |
+
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
|
14 |
+
<option name="LAST_RESOLUTION" value="IGNORE" />
|
15 |
+
</component>
|
16 |
+
<component name="Git.Settings">
|
17 |
+
<option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$/.." />
|
18 |
+
</component>
|
19 |
+
<component name="ProjectColorInfo">{
|
20 |
+
"associatedIndex": 4
|
21 |
+
}</component>
|
22 |
+
<component name="ProjectId" id="30B97tAgogsByivUWQZ4PQ1jA8y" />
|
23 |
+
<component name="ProjectViewState">
|
24 |
+
<option name="hideEmptyMiddlePackages" value="true" />
|
25 |
+
<option name="showLibraryContents" value="true" />
|
26 |
+
</component>
|
27 |
+
<component name="PropertiesComponent"><![CDATA[{
|
28 |
+
"keyToString": {
|
29 |
+
"ModuleVcsDetector.initialDetectionPerformed": "true",
|
30 |
+
"RunOnceActivity.ShowReadmeOnStart": "true",
|
31 |
+
"RunOnceActivity.git.unshallow": "true",
|
32 |
+
"git-widget-placeholder": "crawl__selenium",
|
33 |
+
"ignore.virus.scanning.warn.message": "true",
|
34 |
+
"last_opened_file_path": "E:/SpringBoot/DoAnNganh/Medical_News/coreRAG_chatbot"
|
35 |
+
}
|
36 |
+
}]]></component>
|
37 |
+
<component name="SharedIndexes">
|
38 |
+
<attachedChunks>
|
39 |
+
<set>
|
40 |
+
<option value="bundled-python-sdk-890ed5b35930-d9c5bdb153f4-com.jetbrains.pycharm.pro.sharedIndexes.bundled-PY-251.23774.444" />
|
41 |
+
</set>
|
42 |
+
</attachedChunks>
|
43 |
+
</component>
|
44 |
+
<component name="TaskManager">
|
45 |
+
<task active="true" id="Default" summary="Default task">
|
46 |
+
<changelist id="3da4a5a9-1ccb-4398-b85d-b48b0ea6f952" name="Changes" comment="" />
|
47 |
+
<created>1753089934892</created>
|
48 |
+
<option name="number" value="Default" />
|
49 |
+
<option name="presentableId" value="Default" />
|
50 |
+
<updated>1753089934892</updated>
|
51 |
+
</task>
|
52 |
+
<servers />
|
53 |
+
</component>
|
54 |
+
</project>
|
ChatbotMedical.egg-info/PKG-INFO
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Metadata-Version: 2.4
|
2 |
+
Name: ChatbotMedical
|
3 |
+
Version: 0.1
|
4 |
+
Summary: A chatbot for medical news
|
5 |
+
Author: Theanh13012004
|
6 |
+
Author-email: [email protected]
|
7 |
+
Dynamic: author
|
8 |
+
Dynamic: author-email
|
9 |
+
Dynamic: summary
|
ChatbotMedical.egg-info/SOURCES.txt
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
setup.py
|
2 |
+
ChatbotMedical.egg-info/PKG-INFO
|
3 |
+
ChatbotMedical.egg-info/SOURCES.txt
|
4 |
+
ChatbotMedical.egg-info/dependency_links.txt
|
5 |
+
ChatbotMedical.egg-info/top_level.txt
|
6 |
+
src/__init__.py
|
7 |
+
src/helper.py
|
8 |
+
src/prompt.py
|
ChatbotMedical.egg-info/dependency_links.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
|
ChatbotMedical.egg-info/top_level.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
src
|
DataChatbot/Data_baiviet_benh_vinmec_2025-07-22.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
__pycache__/configs.cpython-310.pyc
ADDED
Binary file (453 Bytes). View file
|
|
__pycache__/configs.cpython-313.pyc
ADDED
Binary file (791 Bytes). View file
|
|
__pycache__/store_index.cpython-310.pyc
ADDED
Binary file (1.6 kB). View file
|
|
__pycache__/store_index.cpython-313.pyc
ADDED
Binary file (2.27 kB). View file
|
|
app.py
ADDED
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from flask import Flask, jsonify, request
from flask_apscheduler import APScheduler
from src.helper import download_hugging_face_embeddings
from langchain_pinecone import PineconeVectorStore
from langchain_openai import ChatOpenAI
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from configs import *
from langchain_core.prompts import ChatPromptTemplate
from src.prompt import *
from store_index import *
from sentence_transformers import CrossEncoder
# NOTE: the original `from openai import embeddings` was removed — the name
# was immediately shadowed by download_hugging_face_embeddings() below and
# only caused confusion.


app = Flask(__name__)
scheduler = APScheduler()
scheduler.init_app(app)


# Embedding model; must be the same one used when the index was built.
embeddings = download_hugging_face_embeddings()

# Connect to the already-created Pinecone index (run store_index.py once first).
docsearch = PineconeVectorStore.from_existing_index(
    index_name=INDEX_NAME,
    embedding=embeddings
)

# similarity = cosine search; fetch k=40 candidates, which rerank_documents()
# later narrows down to the best 5 with the cross-encoder.
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k": 40})

cross_encoder = CrossEncoder(MODEL_CROSS_ENCODER_NAME)


# OpenRouter-hosted DeepSeek model: temperature 0.4, at most 2048 output tokens.
llm = ChatOpenAI(
    model=MODEL_LLM_NAME,
    openai_api_key=DEEPSEEK_API_KEY,
    openai_api_base="https://openrouter.ai/api/v1",
    temperature=0.4,
    max_tokens=2048
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}")  # the user's question
    ]
)


question_answer_chain = create_stuff_documents_chain(llm, prompt)
58 |
+
|
def rerank_documents(query, docs, top_n=5):
    """Re-score retrieved docs with the cross-encoder and keep the best top_n."""
    query_doc_pairs = [[query, candidate.page_content] for candidate in docs]
    relevance_scores = cross_encoder.predict(query_doc_pairs)
    ranked = sorted(zip(docs, relevance_scores), key=lambda pair: pair[1], reverse=True)
    return [candidate for candidate, _score in ranked[:top_n]]
|
64 |
+
|
65 |
+
|
66 |
+
|
67 |
+
|
68 |
+
|
@app.route('/chat_chatbot', methods=['POST'])
def chat_chatbot():
    """Answer a medical question end-to-end.

    Expects JSON body ``{"msg": "<question>"}``. Retrieves k=40 candidate
    chunks, cross-encoder re-ranks them down to 5, then asks the LLM.
    Returns ``{"answer": ...}`` or a 400 error when "msg" is missing.
    """
    data = request.json or {}
    user_input = data.get("msg")
    # Guard: a missing/empty question would otherwise crash inside the retriever.
    if not user_input:
        return jsonify({"error": "Missing 'msg' in request body"}), 400
    # Top-k retrieval followed by cross-encoder re-ranking.
    similar_docs = retriever.invoke(user_input)
    top_docs = rerank_documents(user_input, similar_docs, top_n=5)
    response = question_answer_chain.invoke({
        "input": user_input,
        "context": top_docs
    })

    return jsonify({"answer": response})
|
82 |
+
|
@app.route("/train_new_files", methods=["POST"])
def train_api():
    """Manually trigger indexing of any data files that are not trained yet."""
    training_result = train_new_files()
    return jsonify({"status": training_result})
|
87 |
+
|
88 |
+
|
@scheduler.task('interval', id='train_job',hours=6)
def scheduled_train():
    """Every 6 hours, index any new data files that have appeared in DATA_FOLDER."""
    print("Scheduled training started...")
    train_new_files()
|
93 |
+
|
scheduler.start()

if __name__ == '__main__':
    # use_reloader=False: Flask's debug reloader spawns a second process,
    # which would start the APScheduler 'train_job' twice and run every
    # scheduled training in duplicate.
    app.run(host="0.0.0.0", port=8080, debug=True, use_reloader=False)
|
98 |
+
|
99 |
+
|
configs.py
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Central configuration: secrets come from .env, model/index names are fixed here.
import os
from dotenv import load_dotenv

load_dotenv()

PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY")
PINECONE_ENV = os.getenv("PINECONE_ENV", "us-east1-gcp")
# LLM served via OpenRouter (see app.py: openai_api_base points there).
MODEL_LLM_NAME ="deepseek/deepseek-r1-distill-llama-70b:free"
# Vietnamese sentence-embedding model (also hard-coded in src/helper.py).
MODEL_EMBEDING_NAME ="dangvantuan/vietnamese-embedding"
# Cross-encoder used to re-rank retrieved chunks.
MODEL_CROSS_ENCODER_NAME = "itdainb/PhoRanker"
INDEX_NAME = "medical-chatbot"
DATA_FOLDER = "./DataChatbot/"
TRAINED_LOG = "trained_files.log" # Files that were already trained are recorded here by name
|
requirements.txt
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
sentence-transformers
|
2 |
+
pyvi
|
3 |
+
langchain
|
4 |
+
flask
|
5 |
+
flask-apscheduler
|
6 |
+
pypdf
|
7 |
+
python-dotenv
|
8 |
+
pinecone[grpc]
|
9 |
+
langchain-pinecone
|
10 |
+
langchain_community
|
11 |
+
langchain_openai
|
12 |
+
langchain_experimental
|
13 |
+
langchain-huggingface
|
14 |
+
|
15 |
+
-e .
|
research/trials.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
setup.py
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Packaging metadata; installed editable via `-e .` in requirements.txt so
the `src` package is importable from anywhere in the project."""
from setuptools import setup, find_packages


setup(
    name='ChatbotMedical',
    version='0.1',
    author='Theanh13012004',
    author_email='[email protected]',
    description='A chatbot for medical news',
    packages=find_packages(),
    install_requires=[
    ],

)
|
src/__init__.py
ADDED
File without changes
|
src/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (161 Bytes). View file
|
|
src/__pycache__/__init__.cpython-313.pyc
ADDED
Binary file (165 Bytes). View file
|
|
src/__pycache__/helper.cpython-310.pyc
ADDED
Binary file (1.9 kB). View file
|
|
src/__pycache__/helper.cpython-313.pyc
ADDED
Binary file (2.93 kB). View file
|
|
src/__pycache__/prompt.cpython-310.pyc
ADDED
Binary file (514 Bytes). View file
|
|
src/__pycache__/prompt.cpython-313.pyc
ADDED
Binary file (675 Bytes). View file
|
|
src/helper.py
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from langchain_community.document_loaders import DirectoryLoader, UnstructuredWordDocumentLoader , TextLoader #Đã update
|
2 |
+
from langchain_huggingface import HuggingFaceEmbeddings # Đã update
|
3 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
4 |
+
import re
|
5 |
+
import os
|
6 |
+
|
7 |
+
|
8 |
+
|
9 |
+
#Load data
|
10 |
+
|
11 |
+
|
def load_word_files(data):
    """Load every *.txt file under `data` as LangChain documents (UTF-8)."""
    txt_loader = DirectoryLoader(
        path=data,
        glob="*.txt",
        # Force UTF-8 so Vietnamese text decodes correctly on every platform.
        loader_cls=lambda file_path: TextLoader(file_path, encoding='utf-8'),
    )
    return txt_loader.load()
|
20 |
+
|
21 |
+
|
22 |
+
|
23 |
+
#Preprocess data
|
def preprocess_data(text):
    """Clean raw article text before chunking.

    Strips URLs, separator-only lines (=== / ---), markdown table cells,
    emoji and stray symbols (keeping word chars, basic punctuation and
    Vietnamese diacritics), then collapses all whitespace to single spaces.
    """
    cleanup_rules = [
        (r'(https?://\S+|www\.\S+)', ''),        # URLs
        (r'^[=\-]{2,}\s*$', ''),                 # lines made only of = or -
        (r'\|.*?\|', ''),                        # markdown table cells
        (r'[^\w\s,.!?à-ỹÀ-Ỹ\-–]', ''),           # emoji / unwanted symbols
    ]
    for pattern, replacement in cleanup_rules:
        text = re.sub(pattern, replacement, text, flags=re.MULTILINE)
    # Collapse runs of whitespace (including newlines) and trim the edges.
    return re.sub(r'\s+', ' ', text).strip()
|
38 |
+
|
39 |
+
|
# Split documents into overlapping chunks for embedding.
def text_split(cleaned_data):
    """Split documents into 512-character chunks with a 20-character overlap."""
    splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=20)
    return splitter.split_documents(cleaned_data)
|
45 |
+
|
46 |
+
|
def download_hugging_face_embeddings(model_name="dangvantuan/vietnamese-embedding"):
    """Load the HuggingFace sentence-embedding model used for indexing and search.

    Args:
        model_name: HuggingFace model id. Defaults to the Vietnamese model the
            project was built with (same value as configs.MODEL_EMBEDING_NAME);
            store_index.py creates the Pinecone index with dimension=768, so
            any override must produce 768-dim vectors.

    Returns:
        A HuggingFaceEmbeddings instance.
    """
    embeddings = HuggingFaceEmbeddings(
        model_name=model_name
    )
    return embeddings
|
52 |
+
|
53 |
+
|
54 |
+
|
55 |
+
|
56 |
+
#n8n
|
57 |
+
|
# Check whether a file has already been trained.
def is_file_trained(file_name, trained_files_log):
    """Return True if `file_name` is already recorded in the training log."""
    if not os.path.exists(trained_files_log):
        # No log yet means nothing has been trained so far.
        return False
    with open(trained_files_log, 'r', encoding='utf-8') as log:
        trained_names = log.read().splitlines()
    return file_name in trained_names
|
65 |
+
|
# Record that a file has been trained.
def mark_file_trained(file_name , trained_files_log):
    """Append `file_name` to the training log so it is skipped next time."""
    with open(trained_files_log, 'a', encoding='utf-8') as log:  # append mode
        log.write(f"{file_name}\n")
|
src/prompt.py
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# System prompt for the medical QA assistant (kept in Vietnamese — it is a
# runtime string). Roughly: "You are an assistant for medical questions; use
# the retrieved context below; say you don't know if you don't; synthesize
# consistently across sources; answer in at most three short sentences."
system_prompt = (
    "Bạn là một trợ lý để trả lời các câu hỏi y tế. "
    "Hãy sử dụng các đoạn ngữ cảnh được truy xuất sau đây để trả lời câu hỏi."
    "Nếu bạn không biết câu trả lời, hãy nói rằng bạn không biết."
    "Dựa trên tất cả tài liệu liên quan (có thể đến từ nhiều nguồn), hãy tổng hợp thông tin một cách thống nhất. " # guards against sources that contradict each other (e.g. one says contagious, another says not)
    "Trả lời tối đa ba câu và giữ cho câu trả lời ngắn gọn."
    "\n\n"
    "{context}"
)
|
store_index.py
ADDED
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from src.helper import *
from pinecone.grpc import PineconeGRPC as Pinecone
from langchain.schema import Document
from pinecone import ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from dotenv import load_dotenv
import os
from configs import *



# Load the Vietnamese embedding model (shared with app.py's retrieval path).
embeddings = download_hugging_face_embeddings()

pc = Pinecone(api_key=PINECONE_API_KEY)
|
16 |
+
|
17 |
+
|
18 |
+
|
def train_new_files():
    """Index any data files that have not been trained yet.

    Loads every *.txt file from DATA_FOLDER, skips files already listed in
    TRAINED_LOG, cleans and chunks the rest, and upserts the chunks into the
    Pinecone index (creating the index on first run).

    Returns:
        A status string when there is nothing new to train, otherwise None.
    """
    all_docs = load_word_files(data=DATA_FOLDER)
    new_docs = []  # cleaned documents still awaiting indexing

    for doc in all_docs:
        # LangChain loaders put the file path in metadata["source"]; fall back
        # to a placeholder if it is missing.
        file_name = doc.metadata.get("source", "unknown.docx")
        if not is_file_trained(file_name, TRAINED_LOG):
            print(f"Phát hiện có file mới và training: {file_name}")
            cleaned_content = preprocess_data(doc.page_content)
            cleaned_doc = Document(
                page_content=cleaned_content,
                metadata=doc.metadata
            )
            new_docs.append(cleaned_doc)
            # NOTE(review): the file is marked trained before the upsert below
            # succeeds — a failed upsert leaves it permanently skipped. Kept
            # as-is to preserve existing behavior; consider moving this after
            # the PineconeVectorStore call.
            mark_file_trained(file_name, TRAINED_LOG)
        else:
            print(f"File đã được train trước đó: {file_name}")
    if not new_docs:
        return "Không có file mới nào để train"

    # Chunk the new documents for embedding.
    text_chunks = text_split(new_docs)
    # BUG FIX: pc.list_indexes() returns index description objects, so the
    # original `INDEX_NAME not in pc.list_indexes()` was always True and
    # create_index() raised "already exists" on every run after the first.
    # Compare against the index *names* instead.
    if INDEX_NAME not in pc.list_indexes().names():
        pc.create_index(
            name=INDEX_NAME,
            dimension=768,  # must match the embedding model's output size
            metric="cosine",
            spec=ServerlessSpec(cloud="aws", region="us-east-1")
        )
    PineconeVectorStore.from_documents(
        documents=text_chunks,
        index_name=INDEX_NAME,
        embedding=embeddings
    )
|
57 |
+
|
58 |
+
|
59 |
+
#Chỉ chạy 1 lần đầu tạo db
|
60 |
+
|
if __name__ == "__main__":
    # Run this module directly once to build the vector DB for the first time.
    print(train_new_files())
|
64 |
+
|
65 |
+
|
66 |
+
|
67 |
+
|
68 |
+
|
69 |
+
|
70 |
+
|
71 |
+
|
72 |
+
|
73 |
+
|
74 |
+
|
75 |
+
|
76 |
+
|
77 |
+
|
78 |
+
|
79 |
+
#Chạy store_index.py để tạo db cho lần đầu
|
template.py
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Project scaffolding script: create every file/folder the project needs.

Existing non-empty files are never overwritten.
"""
import os
from pathlib import Path  # OS-independent path handling
import logging


logging.basicConfig(level=logging.INFO, format='[%(asctime)s]: %(message)s:')

# Every file the project skeleton must contain.
list_dir = [
    "src/__init__.py",
    "src/helper.py",
    "src/prompt.py",
    ".env",
    "setup.py",
    "store_index.py",
    "app.py",
    "research/trials.ipynb",
]


for filepath in list_dir:
    filepath = Path(filepath)  # normalise separators for Windows/Linux/macOS
    filedir, filename = os.path.split(filepath)

    if filedir != "":
        os.makedirs(filedir, exist_ok=True)
        # NOTE: the scraped original showed "(unknown)" here — reconstructed
        # as the {filename} placeholder the log message clearly intends.
        logging.info(f"Đang tạo thư mục:{filedir} với các file: {filename}")

    # Create the file only when it is missing or empty, so existing work
    # is never clobbered.
    if (not os.path.exists(filepath)) or (os.path.getsize(filepath) == 0):
        with open(filepath, "w"):
            pass
        logging.info(f'Đang tạo file {filename}')
    else:
        logging.info(f"{filename} đã tồn tại")
|
trained_files.log
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
DataChatbot\Data_baiviet_benh_vinmec_2025-07-22.txt
|