Upload 18 files

- .gitattributes +2 -0
- Dockerfile +17 -0
- chainlit.md +14 -0
- data/2307_Pac-coup-doeil_BRO_BD.pdf +3 -0
- data/AB-Corse-2024_notice.pdf +0 -0
- data/AB-Hexagone-2024_notice.pdf +0 -0
- data/AC-2024_notice.pdf +0 -0
- data/ADMCA-2024_notice.pdf +0 -0
- data/AO-2024_notice.pdf +0 -0
- data/APR-2024_notice.pdf +0 -0
- data/Conditionnalite-2023_fiche-technique_presentation-generale.pdf +0 -0
- data/PAB-2024_notice.pdf +0 -0
- data/PPR-2024_notice.pdf +0 -0
- data/VSLM-2024_notice.pdf +0 -0
- data/vector_store/index.faiss +3 -0
- data/vector_store/index.pkl +3 -0
- rag_app.py +70 -0
- rag_module.py +200 -0
- requirements.txt +9 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+data/2307_Pac-coup-doeil_BRO_BD.pdf filter=lfs diff=lfs merge=lfs -text
+data/vector_store/index.faiss filter=lfs diff=lfs merge=lfs -text
Dockerfile
ADDED
@@ -0,0 +1,17 @@
+FROM python:3.10
+
+WORKDIR /home/usr/
+
+COPY ./requirements.txt /home/usr/requirements.txt
+
+
+RUN pip install --no-cache-dir --upgrade -r /home/usr/requirements.txt
+RUN mkdir .files/
+
+COPY . .
+COPY .chainlit .chainlit
+
+
+CMD ["chainlit", "run", "/home/usr/rag_app.py", "--host", "0.0.0.0", "--port", "7860"]
+# CMD ["ls", "-a"]
+# RUN chmod 755 /usr/local/lib/python3.10/
chainlit.md
ADDED
@@ -0,0 +1,14 @@
+# Welcome to Chainlit! 🚀🤖
+
+Hi there, Developer! 👋 We're excited to have you on board. Chainlit is a powerful tool designed to help you prototype, debug and share applications built on top of LLMs.
+
+## Useful Links 🔗
+
+- **Documentation:** Get started with our comprehensive [Chainlit Documentation](https://docs.chainlit.io) 📚
+- **Discord Community:** Join our friendly [Chainlit Discord](https://discord.gg/k73SQ3FyUh) to ask questions, share your projects, and connect with other developers! 💬
+
+We can't wait to see what you create with Chainlit! Happy coding! 💻😊
+
+## Welcome screen
+
+To modify the welcome screen, edit the `chainlit.md` file at the root of your project. If you do not want a welcome screen, just leave this file empty.
data/2307_Pac-coup-doeil_BRO_BD.pdf
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a19e16f6788c332870968a0588fb911344454368e2cba765ccd822fa322edde3
+size 2431848
data/AB-Corse-2024_notice.pdf
ADDED
Binary file (280 kB)

data/AB-Hexagone-2024_notice.pdf
ADDED
Binary file (276 kB)

data/AC-2024_notice.pdf
ADDED
Binary file (271 kB)

data/ADMCA-2024_notice.pdf
ADDED
Binary file (279 kB)

data/AO-2024_notice.pdf
ADDED
Binary file (289 kB)

data/APR-2024_notice.pdf
ADDED
Binary file (281 kB)

data/Conditionnalite-2023_fiche-technique_presentation-generale.pdf
ADDED
Binary file (721 kB)

data/PAB-2024_notice.pdf
ADDED
Binary file (373 kB)

data/PPR-2024_notice.pdf
ADDED
Binary file (260 kB)

data/VSLM-2024_notice.pdf
ADDED
Binary file (207 kB)
data/vector_store/index.faiss
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4f84a34da5d2ce56a8ec5da735f30c382dd7df98ee7c2ba4dc19b8f30073ab01
+size 2347053
data/vector_store/index.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:604e1c8d73553c5dc4edaab0359a3cf8bf92405da4f446a77d153f64b4b9ce98
+size 561617
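The two LFS pointers above show that the commit ships a prebuilt FAISS index rather than building it at startup, and no build script is included. As a hedged sketch only: an index compatible with `RagModule` below (same splitter, same `mistral-embed` model, same `data/vector_store` path, and the `source`/`page` metadata keys that `get_sources_document` consumes) could plausibly be produced along these lines. The file name `build_index.py` and the per-page chunking loop are assumptions, not part of this commit.

```python
# build_index.py — hypothetical helper, NOT part of this commit.
# Rebuilds data/vector_store/index.faiss + index.pkl from the PDFs in data/,
# reusing RagModule's splitter and embedding client.
from pathlib import Path

from langchain_community.vectorstores import FAISS
from pypdf import PdfReader

from rag_module import RagModule

rag = RagModule()

texts, metadatas = [], []
for pdf_path in Path("data").glob("*.pdf"):
    reader = PdfReader(str(pdf_path))
    for page_number, page in enumerate(reader.pages):
        # chunk each page so retrieved documents carry a usable page number
        for chunk in rag.split_text(page.extract_text()):
            texts.append(chunk)
            metadatas.append({"source": str(pdf_path), "page": page_number})

# embed the chunks with mistral-embed and persist the store locally
db = FAISS.from_texts(texts, rag.embedding_model, metadatas=metadatas)
db.save_local(rag.db_faiss_path)
```

Keeping `source` as the on-disk path matters: `rag_app.py` below passes it straight to `cl.Pdf(path=...)` to render the cited notice inline.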
rag_app.py
ADDED
@@ -0,0 +1,70 @@
+import re
+import json
+import chainlit as cl
+
+from langchain_community.vectorstores import FAISS
+from rag_module import RagModule
+
+from collections import defaultdict
+
+
+prompt_template = """
+Tu t'appelles GAIA et tu travailles pour la Politique Agricole Commune (PAC). Tu es un agent intelligent spécialisé sur les aides financières agricoles.
+Tu es chargé de donner des conseils sur les aides financières disponibles pour les agriculteurs.
+Tu comprends et génères les réponses en français, jamais en anglais.
+Merci de bien vouloir répondre aux questions en utilisant seulement le contexte suivant.
+contexte: {context}
+
+historique: {history}
+
+question: {question}
+réponse:
+"""
+
+
+##------------ CHAINLIT ---------------##
+@cl.on_chat_start
+async def start():
+    """Build the retrieval chain once per session and greet the user."""
+    rag = RagModule()
+
+    db = rag.get_faiss_db()
+
+    qa_chain = rag.retrieval_qa_memory_chain(db, prompt_template)
+
+    msg = cl.Message(content="Lancement du bot...", author="Gaia")
+    await msg.send()
+    msg.content = "Bonjour et bienvenue sur le bot qui connaît tout sur toutes les aides de la PAC (ou à peu près). Quelle est ta question ?"
+    await msg.update()
+    cl.user_session.set("chain", qa_chain)
+
+
+@cl.on_message
+async def main(message):
+    """Answer each user message with the session chain and cite its sources."""
+    rag = RagModule()
+
+    chain = cl.user_session.get("chain")
+
+    cb = cl.AsyncLangchainCallbackHandler(
+        stream_final_answer=True,
+        answer_prefix_tokens=["FINAL", "ANSWER"]
+    )
+
+    cb.answer_reached = True
+    response = await chain.ainvoke(message.content, callbacks=[cb])
+
+    answer = response.get('result')
+    sources = rag.get_sources_document(response.get('source_documents'))
+
+    # one inline PDF viewer per distinct source file
+    elements = [cl.Pdf(name="Pdf", display="inline", path=path) for path in sources]
+
+    if response.get('source_documents'):
+        answer = rag.shape_answer_with_source(answer, sources)
+    else:
+        answer += "\nNo sources found"
+
+    await cl.Message(content=answer, elements=elements, author="Gaia").send()
+    # await cl.Message(content=answer, author="Chatbot Eureden").send()
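Because the chain is built from plain LangChain objects, it can be exercised without the Chainlit front end. A minimal sketch under stated assumptions: `MISTRAL_API_KEY` is configured, the vector store is present, and importing `rag_app` merely registers the Chainlit handlers. The file name `smoke_test.py` and the sample question are illustrative, not part of the commit.

```python
# smoke_test.py — hypothetical snippet, NOT in the commit: runs the same
# chain rag_app.py builds, without the Chainlit UI.
import asyncio

from rag_app import prompt_template
from rag_module import RagModule


async def ask(question: str) -> str:
    rag = RagModule()
    db = rag.get_faiss_db()
    chain = rag.retrieval_qa_memory_chain(db, prompt_template)

    # same call rag_app.main makes, minus the streaming callback
    response = await chain.ainvoke(question)

    answer = response.get("result")
    sources = rag.get_sources_document(response.get("source_documents"))
    return rag.shape_answer_with_source(answer, sources)


if __name__ == "__main__":
    print(asyncio.run(ask("Quelles sont les aides bio en Corse ?")))
```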
rag_module.py
ADDED
@@ -0,0 +1,200 @@
+# load & split data
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+# embed data
+from langchain_mistralai.embeddings import MistralAIEmbeddings
+# vector store
+from langchain_community.vectorstores import FAISS
+# prompt
+from langchain.prompts import PromptTemplate
+# memory
+from langchain.memory import ConversationBufferMemory
+# llm
+from langchain_mistralai.chat_models import ChatMistralAI
+
+# chain modules
+from langchain.chains import RetrievalQA
+
+# pdf reading (pypdf is the reader pinned in requirements.txt)
+from pypdf import PdfReader
+
+import os
+import re
+from dotenv import load_dotenv
+load_dotenv()
+from collections import defaultdict
+
+
+class RagModule():
+    def __init__(self):
+        # read the key from the environment (.env) rather than hard-coding a secret in the repo
+        self.mistral_api_key = os.getenv("MISTRAL_API_KEY")
+        self.model_name_embedding = "mistral-embed"
+        self.embedding_model = MistralAIEmbeddings(model=self.model_name_embedding, mistral_api_key=self.mistral_api_key)
+        self.chunk_size = 1000
+        self.chunk_overlap = 120
+        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)
+        self.db_faiss_path = "data/vector_store"
+        # params llm
+        self.llm_model = "mistral-small"
+        self.max_tokens = 512
+        self.top_p = 0.5
+        self.temperature = 0.1
+
+    def read_pdf_file(self, file):
+        """Read a pdf file and concatenate the text of all its pages.
+
+        Args:
+            file: object exposing a `.path` attribute pointing to a pdf
+
+        Returns:
+            str: full text of the document
+        """
+        pdf = PdfReader(file.path)
+        pdf_text = ""
+        for page in pdf.pages:
+            pdf_text += page.extract_text()
+
+        return pdf_text
+
+    def split_text(self, text: str) -> list:
+        """Split the text into chunks.
+
+        Args:
+            text (str): raw text to split
+
+        Returns:
+            list: chunks of at most `chunk_size` characters
+        """
+        texts = self.text_splitter.split_text(text)
+        return texts
+
+    def get_metadata(self, texts: list) -> list:
+        """Build one metadata dict per chunk, numbering the paragraphs.
+
+        Args:
+            texts (list): text chunks
+
+        Returns:
+            list: [{"source": "Paragraphe: 0"}, ...]
+        """
+        metadatas = [{"source": f'Paragraphe: {i}'} for i in range(len(texts))]
+        return metadatas
+
+    def get_faiss_db(self):
+        """Load the local faiss vector store containing all embeddings."""
+        db = FAISS.load_local(self.db_faiss_path, self.embedding_model)
+        return db
+
+    def set_custom_prompt(self, prompt_template: str):
+        """Instantiate the prompt template used for Q&A retrieval.
+
+        Args:
+            prompt_template (str): template with {context}, {history} and {question} placeholders
+        """
+        prompt = PromptTemplate.from_template(
+            template=prompt_template,
+        )
+
+        return prompt
+
+    def load_mistral(self):
+        """Instantiate the Mistral chat LLM."""
+        model_kwargs = {
+            "mistral_api_key": self.mistral_api_key,
+            "model": self.llm_model,
+            # ChatMistralAI's token cap is named max_tokens, not max_new_tokens
+            "max_tokens": self.max_tokens,
+            "top_p": self.top_p,
+            "temperature": self.temperature,
+        }
+
+        llm = ChatMistralAI(**model_kwargs)
+
+        return llm
+
+    def retrieval_qa_memory_chain(self, db, prompt_template):
+        """Build a RetrievalQA chain with conversation memory."""
+        llm = self.load_mistral()
+        prompt = self.set_custom_prompt(prompt_template)
+        memory = ConversationBufferMemory(
+            memory_key='history',
+            input_key='question'
+        )
+        chain_type_kwargs = {
+            "prompt": prompt,
+            "memory": memory
+        }
+
+        qa_chain = RetrievalQA.from_chain_type(
+            llm=llm,
+            chain_type='stuff',
+            retriever=db.as_retriever(search_kwargs={"k": 5}),
+            chain_type_kwargs=chain_type_kwargs,
+            return_source_documents=True,
+        )
+
+        return qa_chain
+
+    def retrieval_qa_chain(self, db, prompt_template):
+        """Build a RetrievalQA chain without memory."""
+        llm = self.load_mistral()
+        prompt = self.set_custom_prompt(prompt_template)
+
+        chain_type_kwargs = {
+            "prompt": prompt,
+        }
+
+        qa_chain = RetrievalQA.from_chain_type(
+            llm=llm,
+            chain_type='stuff',
+            retriever=db.as_retriever(search_kwargs={"k": 3}),
+            chain_type_kwargs=chain_type_kwargs,
+            return_source_documents=True,
+        )
+
+        return qa_chain
+
+    def get_sources_document(self, source_documents: list) -> dict:
+        """Map each source path to the list of pages retrieved from it.
+
+        Args:
+            source_documents (list): source_documents of the rag response
+
+        Returns:
+            dict: {
+                path/to/file1: [0, 1, 3],
+                path/to/file2: [5, 2]
+            }
+        """
+        sources = defaultdict(list)
+        for doc in source_documents:
+            sources[doc.metadata["source"]].append(doc.metadata["page"])
+
+        return sources
+
+    def shape_answer_with_source(self, answer: str, sources: dict):
+        """Append one "Fichier ... - Page ..." line per source to the answer.
+
+        Args:
+            answer (str): answer generated by the llm
+            sources (dict): output of get_sources_document
+        """
+        # capture the file name after the last slash of the source path
+        pattern = r"^(.+)\/([^\/]+)$"
+
+        source_msg = ""
+        for path, page in sources.items():
+            file = re.findall(pattern, path)[0][1]
+            source_msg += f"\nFichier: {file} - Page: {page}"
+
+        answer += f"\n{source_msg}"
+
+        return answer
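To see what the two citation helpers at the end of the module produce, here is a hedged, illustrative example with stub documents. It assumes the metadata keys the retriever actually returns (`source`, `page`) and that `MISTRAL_API_KEY` is set so `RagModule` can construct its embedding client; nothing here is part of the commit.

```python
# Hypothetical illustration of get_sources_document / shape_answer_with_source,
# using langchain's Document to stand in for real retrieval output.
from langchain.docstore.document import Document

from rag_module import RagModule

rag = RagModule()

docs = [
    Document(page_content="...", metadata={"source": "data/AB-Corse-2024_notice.pdf", "page": 2}),
    Document(page_content="...", metadata={"source": "data/AB-Corse-2024_notice.pdf", "page": 4}),
    Document(page_content="...", metadata={"source": "data/AC-2024_notice.pdf", "page": 1}),
]

sources = rag.get_sources_document(docs)
# {'data/AB-Corse-2024_notice.pdf': [2, 4], 'data/AC-2024_notice.pdf': [1]}

print(rag.shape_answer_with_source("Réponse du bot.", sources))
# Réponse du bot.
#
# Fichier: AB-Corse-2024_notice.pdf - Page: [2, 4]
# Fichier: AC-2024_notice.pdf - Page: [1]
```

Note that the dict keys are full paths on purpose: `rag_app.py` reuses them directly as `cl.Pdf` paths, while the regex strips them down to bare file names for the user-facing citation.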
requirements.txt
ADDED
@@ -0,0 +1,9 @@
+langchain==0.1.7
+langchain-core==0.1.23
+langchain-mistralai==0.0.4
+langchain-community==0.0.20
+faiss-cpu==1.7.4
+python-dotenv==1.0.1
+chainlit
+openai
+pypdf==4.0.2