Upload 18 files

- .gitattributes +2 -0
- Dockerfile +17 -0
- chainlit.md +14 -0
- data/2307_Pac-coup-doeil_BRO_BD.pdf +3 -0
- data/AB-Corse-2024_notice.pdf +0 -0
- data/AB-Hexagone-2024_notice.pdf +0 -0
- data/AC-2024_notice.pdf +0 -0
- data/ADMCA-2024_notice.pdf +0 -0
- data/AO-2024_notice.pdf +0 -0
- data/APR-2024_notice.pdf +0 -0
- data/Conditionnalite-2023_fiche-technique_presentation-generale.pdf +0 -0
- data/PAB-2024_notice.pdf +0 -0
- data/PPR-2024_notice.pdf +0 -0
- data/VSLM-2024_notice.pdf +0 -0
- data/vector_store/index.faiss +3 -0
- data/vector_store/index.pkl +3 -0
- rag_app.py +70 -0
- rag_module.py +200 -0
- requirements.txt +9 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+data/2307_Pac-coup-doeil_BRO_BD.pdf filter=lfs diff=lfs merge=lfs -text
+data/vector_store/index.faiss filter=lfs diff=lfs merge=lfs -text
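These two added entries route the large brochure PDF and the FAISS index through Git LFS; they are exactly what `git lfs track "data/2307_Pac-coup-doeil_BRO_BD.pdf"` (and the same for `data/vector_store/index.faiss`) appends to .gitattributes. The commands are shown for context only, they are not part of the commit.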
Dockerfile
ADDED
@@ -0,0 +1,17 @@
FROM python:3.10

WORKDIR /home/usr/

COPY ./requirements.txt /home/usr/requirements.txt


RUN pip install --no-cache-dir --upgrade -r /home/usr/requirements.txt
RUN mkdir .files/

COPY . .
COPY .chainlit .chainlit


CMD ["chainlit", "run", "/home/usr/rag_app.py", "--host", "0.0.0.0", "--port", "7860"]
# CMD ["ls", "-a"]
# RUN chmod 755 /usr/local/lib/python3.10/
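The image can also be built and run locally with standard Docker commands, e.g. `docker build -t gaia-rag .` then `docker run -p 7860:7860 -e MISTRAL_API_KEY=<your-key> gaia-rag`. The image name and the env-var injection are illustrative, not part of the commit; port 7860 is the one Chainlit binds in the CMD above and the default expected by Hugging Face Docker Spaces.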
chainlit.md
ADDED
@@ -0,0 +1,14 @@
# Welcome to Chainlit! 🚀🤖

Hi there, Developer! 👋 We're excited to have you on board. Chainlit is a powerful tool designed to help you prototype, debug and share applications built on top of LLMs.

## Useful Links 🔗

- **Documentation:** Get started with our comprehensive [Chainlit Documentation](https://docs.chainlit.io) 📚
- **Discord Community:** Join our friendly [Chainlit Discord](https://discord.gg/k73SQ3FyUh) to ask questions, share your projects, and connect with other developers! 💬

We can't wait to see what you create with Chainlit! Happy coding! 💻😊

## Welcome screen

To modify the welcome screen, edit the `chainlit.md` file at the root of your project. If you do not want a welcome screen, just leave this file empty.
data/2307_Pac-coup-doeil_BRO_BD.pdf
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a19e16f6788c332870968a0588fb911344454368e2cba765ccd822fa322edde3
size 2431848
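The three lines above are a standard Git LFS pointer: the spec version, the SHA-256 of the actual file, and its size in bytes (about 2.4 MB here). The real PDF lives in LFS storage, not in the git history.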
data/AB-Corse-2024_notice.pdf
ADDED
Binary file (280 kB)

data/AB-Hexagone-2024_notice.pdf
ADDED
Binary file (276 kB)

data/AC-2024_notice.pdf
ADDED
Binary file (271 kB)

data/ADMCA-2024_notice.pdf
ADDED
Binary file (279 kB)

data/AO-2024_notice.pdf
ADDED
Binary file (289 kB)

data/APR-2024_notice.pdf
ADDED
Binary file (281 kB)

data/Conditionnalite-2023_fiche-technique_presentation-generale.pdf
ADDED
Binary file (721 kB)

data/PAB-2024_notice.pdf
ADDED
Binary file (373 kB)

data/PPR-2024_notice.pdf
ADDED
Binary file (260 kB)

data/VSLM-2024_notice.pdf
ADDED
Binary file (207 kB)
data/vector_store/index.faiss
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4f84a34da5d2ce56a8ec5da735f30c382dd7df98ee7c2ba4dc19b8f30073ab01
size 2347053
data/vector_store/index.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:604e1c8d73553c5dc4edaab0359a3cf8bf92405da4f446a77d153f64b4b9ce98
size 561617
rag_app.py
ADDED
@@ -0,0 +1,70 @@
import chainlit as cl

from rag_module import RagModule


prompt_template = """
Tu t'appelles GAIA et tu travailles pour la Politique Agricole Commune (PAC). Tu es un agent intelligent spécialisé sur les aides financières agricoles.
Tu es chargé de donner des conseils sur les aides financières disponibles pour les agriculteurs.
Tu comprends et génères les réponses en français, jamais en anglais.
Merci de bien vouloir répondre aux questions en utilisant seulement le contexte suivant.
contexte: {context}

historique: {history}

question: {question}
réponse:
"""


##------------ CHAINLIT ---------------##
@cl.on_chat_start
async def start():
    rag = RagModule()

    db = rag.get_faiss_db()

    qa_chain = rag.retrieval_qa_memory_chain(db, prompt_template)

    msg = cl.Message(content="Lancement du bot...", author="Gaia")
    await msg.send()
    msg.content = "Bonjour et bienvenue sur le bot qui connaît tout sur toutes les aides de la PAC (ou à peu près). Quelle est ta question ?"
    await msg.update()
    cl.user_session.set("chain", qa_chain)


@cl.on_message
async def main(message):
    rag = RagModule()

    chain = cl.user_session.get("chain")

    cb = cl.AsyncLangchainCallbackHandler(
        stream_final_answer=True,
        answer_prefix_tokens=["FINAL", "ANSWER"],
    )
    cb.answer_reached = True
    response = await chain.ainvoke(message.content, callbacks=[cb])

    answer = response.get("result")
    sources = rag.get_sources_document(response.get("source_documents"))

    # attach each cited source PDF inline in the chat
    elements = [cl.Pdf(name="Pdf", display="inline", path=path) for path in sources]

    if response.get("source_documents"):
        answer = rag.shape_answer_with_source(answer, sources)
    else:
        answer += "\nNo sources found"

    await cl.Message(content=answer, elements=elements, author="Gaia").send()
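For local development outside Docker, the usual Chainlit workflow applies: `chainlit run rag_app.py -w` starts the app with auto-reload (the `-w` watch flag is standard Chainlit CLI, not something this commit configures). Since the chain is stored in `cl.user_session` at chat start, each connected user gets their own chain and therefore their own conversation memory.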
rag_module.py
ADDED
@@ -0,0 +1,200 @@
# load & split data
from langchain.text_splitter import RecursiveCharacterTextSplitter
# embed data
from langchain_mistralai.embeddings import MistralAIEmbeddings
# vector store
from langchain_community.vectorstores import FAISS
# prompt
from langchain.prompts import PromptTemplate
# memory
from langchain.memory import ConversationBufferMemory
# llm
from langchain_mistralai.chat_models import ChatMistralAI

# chain modules
from langchain.chains import RetrievalQA

from pypdf import PdfReader  # pinned in requirements.txt as pypdf==4.0.2

import os
import re
from dotenv import load_dotenv
load_dotenv()
from collections import defaultdict


class RagModule():
    def __init__(self):
        # read the key from the environment (e.g. a .env file) rather than hardcoding it
        self.mistral_api_key = os.getenv("MISTRAL_API_KEY")
        self.model_name_embedding = "mistral-embed"
        self.embedding_model = MistralAIEmbeddings(model=self.model_name_embedding, mistral_api_key=self.mistral_api_key)
        self.chunk_size = 1000
        self.chunk_overlap = 120
        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)
        self.db_faiss_path = "data/vector_store"
        # params llm
        self.llm_model = "mistral-small"
        self.max_new_tokens = 512
        self.top_p = 0.5
        self.temperature = 0.1

    def read_pdf_file(self, file):
        """Extract the raw text of every page of a PDF file.

        Args:
            file: object exposing a `path` attribute pointing to the PDF

        Returns:
            str: concatenated text of all pages
        """
        pdf = PdfReader(file.path)
        pdf_text = ""
        for page in pdf.pages:
            pdf_text += page.extract_text()

        return pdf_text

    def split_text(self, text: str) -> list:
        """Split the text into chunks.

        Args:
            text (str): raw text to split

        Returns:
            list: text chunks of at most `chunk_size` characters
        """
        texts = self.text_splitter.split_text(text)
        return texts

    def get_metadata(self, texts: list) -> list:
        """Build one metadata dict per chunk, numbering the paragraphs.

        Args:
            texts (list): text chunks

        Returns:
            list: [{"source": "Paragraphe: 0"}, {"source": "Paragraphe: 1"}, ...]
        """
        metadatas = [{"source": f'Paragraphe: {i}'} for i in range(len(texts))]
        return metadatas

    def get_faiss_db(self):
        """Load the local FAISS vector store containing all embeddings."""
        db = FAISS.load_local(self.db_faiss_path, self.embedding_model)
        return db

    def set_custom_prompt(self, prompt_template: str):
        """Instantiate the prompt template used for Q&A retrieval.

        Args:
            prompt_template (str): template text with {context}, {history} and {question} placeholders
        """
        prompt = PromptTemplate.from_template(
            template=prompt_template,
        )

        return prompt

    def load_mistral(self):
        """Instantiate the Mistral chat LLM."""
        model_kwargs = {
            "mistral_api_key": self.mistral_api_key,
            "model": self.llm_model,
            "max_tokens": self.max_new_tokens,  # ChatMistralAI's parameter is max_tokens, not max_new_tokens
            "top_p": self.top_p,
            "temperature": self.temperature,
        }

        llm = ChatMistralAI(**model_kwargs)

        return llm

    def retrieval_qa_memory_chain(self, db, prompt_template):
        """Build a RetrievalQA chain with conversation memory."""
        llm = self.load_mistral()
        prompt = self.set_custom_prompt(prompt_template)
        memory = ConversationBufferMemory(
            memory_key='history',
            input_key='question',
        )
        chain_type_kwargs = {
            "prompt": prompt,
            "memory": memory,
        }

        qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type='stuff',
            retriever=db.as_retriever(search_kwargs={"k": 5}),
            chain_type_kwargs=chain_type_kwargs,
            return_source_documents=True,
        )

        return qa_chain

    def retrieval_qa_chain(self, db, prompt_template):
        """Build a RetrievalQA chain without memory."""
        llm = self.load_mistral()
        prompt = self.set_custom_prompt(prompt_template)

        chain_type_kwargs = {
            "prompt": prompt,
        }

        qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type='stuff',
            retriever=db.as_retriever(search_kwargs={"k": 3}),
            chain_type_kwargs=chain_type_kwargs,
            return_source_documents=True,
        )

        return qa_chain

    def get_sources_document(self, source_documents: list) -> dict:
        """Map each source file path to the list of pages it was cited from.

        Args:
            source_documents (list): documents returned in the RAG response

        Returns:
            dict: {
                "path/to/file1": [0, 1, 3],
                "path/to/file2": [5, 2],
            }
        """
        sources = defaultdict(list)
        for doc in source_documents:
            sources[doc.metadata["source"]].append(doc.metadata["page"])

        return sources

    def shape_answer_with_source(self, answer: str, sources: dict):
        """Append one "Fichier: ... - Page: ..." line per source to the answer.

        Args:
            answer (str): answer generated by the LLM
            sources (dict): mapping of file paths to page lists
        """
        pattern = r"^(.+)\/([^\/]+)$"

        source_msg = ""
        for path, pages in sources.items():
            file = re.findall(pattern, path)[0][1]
            source_msg += f"\nFichier: {file} - Page: {pages}"

        answer += f"\n{source_msg}"

        return answer
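rag_module.py only ever loads an existing index from data/vector_store; no build script ships with the commit. Below is a minimal sketch of how the committed index.faiss/index.pkl could be (re)built from the PDFs in data/. PyPDFLoader, the glob pattern, and the overall flow are assumptions about how the index was produced, not code from this commit; PyPDFLoader is a plausible choice because it fills in the "source" and "page" metadata keys that get_sources_document() reads.

# Hypothetical indexing script (assumed workflow, not part of this commit).
from glob import glob

from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS

from rag_module import RagModule

rag = RagModule()

documents = []
for path in glob("data/*.pdf"):
    # one Document per page, with metadata {"source": path, "page": n}
    documents.extend(PyPDFLoader(path).load())

# reuse the module's splitter (chunk_size=1000, chunk_overlap=120)
chunks = rag.text_splitter.split_documents(documents)

# embed with mistral-embed and write index.faiss / index.pkl
db = FAISS.from_documents(chunks, rag.embedding_model)
db.save_local(rag.db_faiss_path)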
requirements.txt
ADDED
@@ -0,0 +1,9 @@
langchain==0.1.7
langchain-core==0.1.23
langchain-mistralai==0.0.4
langchain-community==0.0.20
faiss-cpu==1.7.4
python-dotenv==1.0.1
chainlit
openai
pypdf==4.0.2
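To set up outside the container: `pip install -r requirements.txt`, then create a `.env` file at the project root with a single line such as `MISTRAL_API_KEY=<your-key>`; `load_dotenv()` in rag_module.py picks it up. The variable name is the one assumed by the `os.getenv("MISTRAL_API_KEY")` call in the edited module, not something the original commit defined.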