Perrine committed
Commit b18c318
1 Parent(s): b7d87ea

Upload 18 files

.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ data/2307_Pac-coup-doeil_BRO_BD.pdf filter=lfs diff=lfs merge=lfs -text
+ data/vector_store/index.faiss filter=lfs diff=lfs merge=lfs -text
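
The two added patterns route the new brochure PDF and the FAISS index through Git LFS, so the repository stores the short pointer files shown further down instead of the binaries themselves.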
Dockerfile ADDED
@@ -0,0 +1,17 @@
+ FROM python:3.10
+
+ WORKDIR /home/usr/
+
+ COPY ./requirements.txt /home/usr/requirements.txt
+
+
+ RUN pip install --no-cache-dir --upgrade -r /home/usr/requirements.txt
+ RUN mkdir .files/
+
+ COPY . .
+ COPY .chainlit .chainlit
+
+
+ CMD ["chainlit", "run", "/home/usr/rag_app.py", "--host", "0.0.0.0", "--port", "7860"]
+ # CMD ["ls", "-a"]
+ # RUN chmod 755 /usr/local/lib/python3.10/
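
A quick local check of this image (a sketch, not part of the commit: the tag name is arbitrary and the app needs a Mistral key at runtime) is `docker build -t gaia-rag .` followed by `docker run -e MISTRAL_API_KEY=... -p 7860:7860 gaia-rag`, after which the Chainlit UI is served on port 7860.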
chainlit.md ADDED
@@ -0,0 +1,14 @@
+ # Welcome to Chainlit! 🚀🤖
+
+ Hi there, Developer! 👋 We're excited to have you on board. Chainlit is a powerful tool designed to help you prototype, debug and share applications built on top of LLMs.
+
+ ## Useful Links 🔗
+
+ - **Documentation:** Get started with our comprehensive [Chainlit Documentation](https://docs.chainlit.io) 📚
+ - **Discord Community:** Join our friendly [Chainlit Discord](https://discord.gg/k73SQ3FyUh) to ask questions, share your projects, and connect with other developers! 💬
+
+ We can't wait to see what you create with Chainlit! Happy coding! 💻😊
+
+ ## Welcome screen
+
+ To modify the welcome screen, edit the `chainlit.md` file at the root of your project. If you do not want a welcome screen, just leave this file empty.
data/2307_Pac-coup-doeil_BRO_BD.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a19e16f6788c332870968a0588fb911344454368e2cba765ccd822fa322edde3
+ size 2431848
data/AB-Corse-2024_notice.pdf ADDED
Binary file (280 kB)

data/AB-Hexagone-2024_notice.pdf ADDED
Binary file (276 kB)

data/AC-2024_notice.pdf ADDED
Binary file (271 kB)

data/ADMCA-2024_notice.pdf ADDED
Binary file (279 kB)

data/AO-2024_notice.pdf ADDED
Binary file (289 kB)

data/APR-2024_notice.pdf ADDED
Binary file (281 kB)

data/Conditionnalite-2023_fiche-technique_presentation-generale.pdf ADDED
Binary file (721 kB)

data/PAB-2024_notice.pdf ADDED
Binary file (373 kB)

data/PPR-2024_notice.pdf ADDED
Binary file (260 kB)

data/VSLM-2024_notice.pdf ADDED
Binary file (207 kB)
data/vector_store/index.faiss ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4f84a34da5d2ce56a8ec5da735f30c382dd7df98ee7c2ba4dc19b8f30073ab01
+ size 2347053
data/vector_store/index.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:604e1c8d73553c5dc4edaab0359a3cf8bf92405da4f446a77d153f64b4b9ce98
+ size 561617
rag_app.py ADDED
@@ -0,0 +1,58 @@
+ import chainlit as cl
+
+ from rag_module import RagModule
+
+
+ prompt_template = """
+ Tu t'appelles GAIA et tu travailles pour la Politique Agricole Commune (PAC). Tu es un agent intelligent spécialisé sur les aides financières agricoles.
+ Tu es chargé de donner des conseils sur les aides financières disponibles pour les agriculteurs.
+ Tu comprends et génères les réponses en français, jamais en anglais.
+ Merci de bien vouloir répondre aux questions en utilisant seulement le contexte suivant.
+ contexte: {context}
+
+ historique: {history}
+
+ question: {question}
+ réponse:
+ """
+
+
+ ##------------ CHAINLIT ---------------##
+ @cl.on_chat_start
+ async def start():
+     """Build the retrieval chain once per session and greet the user."""
+     rag = RagModule()
+     db = rag.get_faiss_db()
+     qa_chain = rag.retrieval_qa_memory_chain(db, prompt_template)
+
+     msg = cl.Message(content="Lancement du bot...", author="Gaia")
+     await msg.send()
+     msg.content = "Bonjour et bienvenue sur le bot qui connaît tout sur toutes les aides de la PAC (ou à peu près). Quelle est ta question ?"
+     await msg.update()
+     cl.user_session.set("chain", qa_chain)
+
+
+ @cl.on_message
+ async def main(message: cl.Message):
+     rag = RagModule()
+     chain = cl.user_session.get("chain")
+
+     cb = cl.AsyncLangchainCallbackHandler(
+         stream_final_answer=True,
+         answer_prefix_tokens=["FINAL", "ANSWER"],
+     )
+     cb.answer_reached = True
+     response = await chain.ainvoke(message.content, config={"callbacks": [cb]})
+
+     answer = response.get("result")
+     sources = rag.get_sources_document(response.get("source_documents"))
+
+     # One inline PDF viewer per cited source file.
+     elements = [cl.Pdf(name="Pdf", display="inline", path=path) for path in sources]
+
+     if response.get("source_documents"):
+         answer = rag.shape_answer_with_source(answer, sources)
+     else:
+         answer += "\nNo sources found"
+
+     await cl.Message(content=answer, elements=elements, author="Gaia").send()
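
For quick iteration outside the Chainlit UI, the same chain can be exercised headlessly. The sketch below is illustrative only: it assumes `MISTRAL_API_KEY` is set in the environment, that it runs from the repository root so `data/vector_store` resolves, and the stand-in prompt and example question are not part of this commit.

```python
# smoke_test.py — minimal headless query against the committed FAISS store.
import asyncio

from rag_module import RagModule

# Stand-in prompt with the same placeholders the chain expects.
PROMPT = """Réponds en français en utilisant seulement le contexte suivant.
contexte: {context}

historique: {history}

question: {question}
réponse:
"""


async def ask(question: str) -> None:
    rag = RagModule()
    db = rag.get_faiss_db()
    chain = rag.retrieval_qa_memory_chain(db, PROMPT)

    response = await chain.ainvoke(question)
    print(response.get("result"))
    # Path -> cited pages, as the Chainlit handler formats them.
    print(rag.get_sources_document(response.get("source_documents")))


if __name__ == "__main__":
    asyncio.run(ask("Qu'est-ce que l'aide ovine ?"))
```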
rag_module.py ADDED
@@ -0,0 +1,187 @@
+ # load & split data
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ # embed data
+ from langchain_mistralai.embeddings import MistralAIEmbeddings
+ # vector store
+ from langchain_community.vectorstores import FAISS
+ # prompt
+ from langchain.prompts import PromptTemplate
+ # memory
+ from langchain.memory import ConversationBufferMemory
+ # llm
+ from langchain_mistralai.chat_models import ChatMistralAI
+
+ # chain modules
+ from langchain.chains import RetrievalQA
+
+ import os
+ from collections import defaultdict
+
+ from pypdf import PdfReader
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+
+ class RagModule():
+     def __init__(self):
+         # The API key is read from the environment (.env); secrets must not be hard-coded.
+         self.mistral_api_key = os.getenv("MISTRAL_API_KEY")
+         self.model_name_embedding = "mistral-embed"
+         self.embedding_model = MistralAIEmbeddings(model=self.model_name_embedding, mistral_api_key=self.mistral_api_key)
+         self.chunk_size = 1000
+         self.chunk_overlap = 120
+         self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)
+         self.db_faiss_path = "data/vector_store"
+         # params llm
+         self.llm_model = "mistral-small"
+         self.max_tokens = 512
+         self.top_p = 0.5
+         self.temperature = 0.1
+
+     def read_pdf_file(self, file):
+         """Read a PDF file and concatenate the text of all its pages.
+
+         Args:
+             file: object exposing a `path` attribute that points to the PDF
+
+         Returns:
+             str: full text of the document
+         """
+         pdf = PdfReader(file.path)
+         pdf_text = ""
+         for page in pdf.pages:
+             pdf_text += page.extract_text()
+
+         return pdf_text
+
+     def split_text(self, text: str) -> list:
+         """Split the text into chunks.
+
+         Args:
+             text (str): raw text to split
+
+         Returns:
+             list: chunks of at most `chunk_size` characters
+         """
+         texts = self.text_splitter.split_text(text)
+         return texts
+
+     def get_metadata(self, texts: list) -> list:
+         """Build one metadata dict per chunk, numbering the paragraphs.
+
+         Args:
+             texts (list): text chunks
+
+         Returns:
+             list: [{"source": "Paragraphe: 0"}, ...], aligned with `texts`
+         """
+         metadatas = [{"source": f"Paragraphe: {i}"} for i in range(len(texts))]
+         return metadatas
+
+     def get_faiss_db(self):
+         """Load the local FAISS vector store containing all embeddings."""
+         db = FAISS.load_local(self.db_faiss_path, self.embedding_model)
+         return db
+
+     def set_custom_prompt(self, prompt_template: str):
+         """Instantiate the prompt template for Q&A retrieval on the vector store.
+
+         Args:
+             prompt_template (str): template text whose placeholders become the prompt's input variables
+         """
+         prompt = PromptTemplate.from_template(
+             template=prompt_template,
+         )
+
+         return prompt
+
+     def load_mistral(self):
+         """Instantiate the LLM."""
+         model_kwargs = {
+             "mistral_api_key": self.mistral_api_key,
+             "model": self.llm_model,
+             "max_tokens": self.max_tokens,
+             "top_p": self.top_p,
+             "temperature": self.temperature,
+         }
+
+         llm = ChatMistralAI(**model_kwargs)
+
+         return llm
+
+     def retrieval_qa_memory_chain(self, db, prompt_template):
+         """Build a RetrievalQA chain with conversation memory."""
+         llm = self.load_mistral()
+         prompt = self.set_custom_prompt(prompt_template)
+         memory = ConversationBufferMemory(
+             memory_key="history",
+             input_key="question",
+         )
+         chain_type_kwargs = {
+             "prompt": prompt,
+             "memory": memory,
+         }
+
+         qa_chain = RetrievalQA.from_chain_type(
+             llm=llm,
+             chain_type="stuff",
+             retriever=db.as_retriever(search_kwargs={"k": 5}),
+             chain_type_kwargs=chain_type_kwargs,
+             return_source_documents=True,
+         )
+
+         return qa_chain
+
+     def retrieval_qa_chain(self, db, prompt_template):
+         """Build a RetrievalQA chain without memory."""
+         llm = self.load_mistral()
+         prompt = self.set_custom_prompt(prompt_template)
+
+         chain_type_kwargs = {
+             "prompt": prompt,
+         }
+
+         qa_chain = RetrievalQA.from_chain_type(
+             llm=llm,
+             chain_type="stuff",
+             retriever=db.as_retriever(search_kwargs={"k": 3}),
+             chain_type_kwargs=chain_type_kwargs,
+             return_source_documents=True,
+         )
+
+         return qa_chain
+
+     def get_sources_document(self, source_documents: list) -> dict:
+         """Map each source file path to the list of pages cited from it.
+
+         Args:
+             source_documents (list): documents returned in the `source_documents` field of the RAG response
+
+         Returns:
+             dict: {
+                 "path/to/file1": [0, 1, 3],
+                 "path/to/file2": [5, 2],
+             }
+         """
+         sources = defaultdict(list)
+         for doc in source_documents:
+             sources[doc.metadata["source"]].append(doc.metadata["page"])
+
+         return sources
+
+     def shape_answer_with_source(self, answer: str, sources: dict):
+         """Append one "Fichier ... - Page ..." line to the answer per cited source.
+
+         Args:
+             answer (str): answer generated by the chain
+             sources (dict): mapping of file path to cited pages
+         """
+         source_msg = ""
+         for path, pages in sources.items():
+             file = os.path.basename(path)
+             source_msg += f"\nFichier: {file} - Page: {pages}"
+
+         answer += f"\n{source_msg}"
+
+         return answer
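
The commit ships a prebuilt index (`data/vector_store/index.faiss` and `index.pkl`) but not the script that produced it. A plausible reconstruction is sketched below; the choice of `PyPDFLoader` is an assumption, made because it emits the `source`/`page` metadata that `get_sources_document()` reads and because `pypdf` is pinned in `requirements.txt`.

```python
# build_index.py — sketch: rebuild data/vector_store from the PDFs in data/.
# Assumes MISTRAL_API_KEY is set and the script runs from the repository root.
import glob

from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS

from rag_module import RagModule


def build_vector_store() -> None:
    rag = RagModule()
    docs = []
    for pdf in glob.glob("data/*.pdf"):
        # Each page becomes a Document with {"source": path, "page": n} metadata.
        docs.extend(PyPDFLoader(pdf).load())

    chunks = rag.text_splitter.split_documents(docs)
    db = FAISS.from_documents(chunks, rag.embedding_model)
    db.save_local(rag.db_faiss_path)  # writes index.faiss and index.pkl


if __name__ == "__main__":
    build_vector_store()
```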
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ langchain==0.1.7
+ langchain-core==0.1.23
+ langchain-mistralai==0.0.4
+ langchain-community==0.0.20
+ faiss-cpu==1.7.4
+ python-dotenv==1.0.1
+ chainlit
+ openai
+ pypdf==4.0.2