Perrine committed
Commit b18c318
1 Parent(s): b7d87ea

Upload 18 files

.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ data/2307_Pac-coup-doeil_BRO_BD.pdf filter=lfs diff=lfs merge=lfs -text
+ data/vector_store/index.faiss filter=lfs diff=lfs merge=lfs -text
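
The two added patterns route the new brochure PDF and the FAISS index through Git LFS, so the repository stores the short pointer files shown further down instead of the binaries themselves.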
Dockerfile ADDED
@@ -0,0 +1,17 @@
+ FROM python:3.10
+
+ WORKDIR /home/usr/
+
+ COPY ./requirements.txt /home/usr/requirements.txt
+
+
+ RUN pip install --no-cache-dir --upgrade -r /home/usr/requirements.txt
+ RUN mkdir .files/
+
+ COPY . .
+ COPY .chainlit .chainlit
+
+
+ CMD ["chainlit", "run", "/home/usr/rag_app.py", "--host", "0.0.0.0", "--port", "7860"]
+ # CMD ["ls", "-a"]
+ # RUN chmod 755 /usr/local/lib/python3.10/
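
A quick local check of this image (a sketch, not part of the commit: the tag name is arbitrary and the app needs a Mistral key at runtime) is `docker build -t gaia-rag .` followed by `docker run -e MISTRAL_API_KEY=... -p 7860:7860 gaia-rag`, after which the Chainlit UI is served on port 7860.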
chainlit.md ADDED
@@ -0,0 +1,14 @@
+ # Welcome to Chainlit! 🚀🤖
+
+ Hi there, Developer! 👋 We're excited to have you on board. Chainlit is a powerful tool designed to help you prototype, debug and share applications built on top of LLMs.
+
+ ## Useful Links 🔗
+
+ - **Documentation:** Get started with our comprehensive [Chainlit Documentation](https://docs.chainlit.io) 📚
+ - **Discord Community:** Join our friendly [Chainlit Discord](https://discord.gg/k73SQ3FyUh) to ask questions, share your projects, and connect with other developers! 💬
+
+ We can't wait to see what you create with Chainlit! Happy coding! 💻😊
+
+ ## Welcome screen
+
+ To modify the welcome screen, edit the `chainlit.md` file at the root of your project. If you do not want a welcome screen, just leave this file empty.
data/2307_Pac-coup-doeil_BRO_BD.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a19e16f6788c332870968a0588fb911344454368e2cba765ccd822fa322edde3
+ size 2431848
data/AB-Corse-2024_notice.pdf ADDED
Binary file (280 kB)

data/AB-Hexagone-2024_notice.pdf ADDED
Binary file (276 kB)

data/AC-2024_notice.pdf ADDED
Binary file (271 kB)

data/ADMCA-2024_notice.pdf ADDED
Binary file (279 kB)

data/AO-2024_notice.pdf ADDED
Binary file (289 kB)

data/APR-2024_notice.pdf ADDED
Binary file (281 kB)

data/Conditionnalite-2023_fiche-technique_presentation-generale.pdf ADDED
Binary file (721 kB)

data/PAB-2024_notice.pdf ADDED
Binary file (373 kB)

data/PPR-2024_notice.pdf ADDED
Binary file (260 kB)

data/VSLM-2024_notice.pdf ADDED
Binary file (207 kB)
data/vector_store/index.faiss ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4f84a34da5d2ce56a8ec5da735f30c382dd7df98ee7c2ba4dc19b8f30073ab01
+ size 2347053
data/vector_store/index.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:604e1c8d73553c5dc4edaab0359a3cf8bf92405da4f446a77d153f64b4b9ce98
+ size 561617
rag_app.py ADDED
@@ -0,0 +1,58 @@
+ import chainlit as cl
+
+ from rag_module import RagModule
+
+
+ prompt_template = """
+ Tu t'appelles GAIA et tu travailles pour la Politique Agricole Commune (PAC). Tu es un agent intelligent spécialisé sur les aides financières agricoles.
+ Tu es chargé de donner des conseils sur les aides financières disponibles pour les agriculteurs.
+ Tu comprends et génères les réponses en français, jamais en anglais.
+ Merci de bien vouloir répondre aux questions en utilisant seulement le contexte suivant.
+ contexte: {context}
+
+ historique: {history}
+
+ question: {question}
+ réponse:
+ """
+
+
+ ##------------ CHAINLIT ---------------##
+ @cl.on_chat_start
+ async def start():
+     """Build the retrieval chain once per session and greet the user."""
+     rag = RagModule()
+     db = rag.get_faiss_db()
+     qa_chain = rag.retrieval_qa_memory_chain(db, prompt_template)
+
+     msg = cl.Message(content="Lancement du bot...", author="Gaia")
+     await msg.send()
+     msg.content = "Bonjour et bienvenue sur le bot qui connaît tout sur toutes les aides de la PAC (ou à peu près). Quelle est ta question ?"
+     await msg.update()
+     cl.user_session.set("chain", qa_chain)
+
+
+ @cl.on_message
+ async def main(message: cl.Message):
+     rag = RagModule()
+     chain = cl.user_session.get("chain")
+
+     cb = cl.AsyncLangchainCallbackHandler(
+         stream_final_answer=True,
+         answer_prefix_tokens=["FINAL", "ANSWER"],
+     )
+     cb.answer_reached = True
+     response = await chain.ainvoke(message.content, config={"callbacks": [cb]})
+
+     answer = response.get("result")
+     sources = rag.get_sources_document(response.get("source_documents"))
+
+     # One inline PDF viewer per cited source file.
+     elements = [cl.Pdf(name="Pdf", display="inline", path=path) for path in sources]
+
+     if response.get("source_documents"):
+         answer = rag.shape_answer_with_source(answer, sources)
+     else:
+         answer += "\nNo sources found"
+
+     await cl.Message(content=answer, elements=elements, author="Gaia").send()
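
For quick iteration outside the Chainlit UI, the same chain can be exercised headlessly. The sketch below is illustrative only: it assumes `MISTRAL_API_KEY` is set in the environment, that it runs from the repository root so `data/vector_store` resolves, and the stand-in prompt and example question are not part of this commit.

```python
# smoke_test.py — minimal headless query against the committed FAISS store.
import asyncio

from rag_module import RagModule

# Stand-in prompt with the same placeholders the chain expects.
PROMPT = """Réponds en français en utilisant seulement le contexte suivant.
contexte: {context}

historique: {history}

question: {question}
réponse:
"""


async def ask(question: str) -> None:
    rag = RagModule()
    db = rag.get_faiss_db()
    chain = rag.retrieval_qa_memory_chain(db, PROMPT)

    response = await chain.ainvoke(question)
    print(response.get("result"))
    # Path -> cited pages, as the Chainlit handler formats them.
    print(rag.get_sources_document(response.get("source_documents")))


if __name__ == "__main__":
    asyncio.run(ask("Qu'est-ce que l'aide ovine ?"))
```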
rag_module.py ADDED
@@ -0,0 +1,187 @@
+ # load & split data
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ # embed data
+ from langchain_mistralai.embeddings import MistralAIEmbeddings
+ # vector store
+ from langchain_community.vectorstores import FAISS
+ # prompt
+ from langchain.prompts import PromptTemplate
+ # memory
+ from langchain.memory import ConversationBufferMemory
+ # llm
+ from langchain_mistralai.chat_models import ChatMistralAI
+
+ # chain modules
+ from langchain.chains import RetrievalQA
+
+ import os
+ from collections import defaultdict
+
+ from pypdf import PdfReader
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+
+ class RagModule():
+     def __init__(self):
+         # The API key is read from the environment (.env); secrets must not be hard-coded.
+         self.mistral_api_key = os.getenv("MISTRAL_API_KEY")
+         self.model_name_embedding = "mistral-embed"
+         self.embedding_model = MistralAIEmbeddings(model=self.model_name_embedding, mistral_api_key=self.mistral_api_key)
+         self.chunk_size = 1000
+         self.chunk_overlap = 120
+         self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)
+         self.db_faiss_path = "data/vector_store"
+         # params llm
+         self.llm_model = "mistral-small"
+         self.max_tokens = 512
+         self.top_p = 0.5
+         self.temperature = 0.1
+
+     def read_pdf_file(self, file):
+         """Read a PDF file and concatenate the text of all its pages.
+
+         Args:
+             file: object exposing a `path` attribute that points to the PDF
+
+         Returns:
+             str: full text of the document
+         """
+         pdf = PdfReader(file.path)
+         pdf_text = ""
+         for page in pdf.pages:
+             pdf_text += page.extract_text()
+
+         return pdf_text
+
+     def split_text(self, text: str) -> list:
+         """Split the text into chunks.
+
+         Args:
+             text (str): raw text to split
+
+         Returns:
+             list: chunks of at most `chunk_size` characters
+         """
+         texts = self.text_splitter.split_text(text)
+         return texts
+
+     def get_metadata(self, texts: list) -> list:
+         """Build one metadata dict per chunk, numbering the paragraphs.
+
+         Args:
+             texts (list): text chunks
+
+         Returns:
+             list: [{"source": "Paragraphe: 0"}, ...], aligned with `texts`
+         """
+         metadatas = [{"source": f"Paragraphe: {i}"} for i in range(len(texts))]
+         return metadatas
+
+     def get_faiss_db(self):
+         """Load the local FAISS vector store containing all embeddings."""
+         db = FAISS.load_local(self.db_faiss_path, self.embedding_model)
+         return db
+
+     def set_custom_prompt(self, prompt_template: str):
+         """Instantiate the prompt template for Q&A retrieval on the vector store.
+
+         Args:
+             prompt_template (str): template text whose placeholders become the prompt's input variables
+         """
+         prompt = PromptTemplate.from_template(
+             template=prompt_template,
+         )
+
+         return prompt
+
+     def load_mistral(self):
+         """Instantiate the LLM."""
+         model_kwargs = {
+             "mistral_api_key": self.mistral_api_key,
+             "model": self.llm_model,
+             "max_tokens": self.max_tokens,
+             "top_p": self.top_p,
+             "temperature": self.temperature,
+         }
+
+         llm = ChatMistralAI(**model_kwargs)
+
+         return llm
+
+     def retrieval_qa_memory_chain(self, db, prompt_template):
+         """Build a RetrievalQA chain with conversation memory."""
+         llm = self.load_mistral()
+         prompt = self.set_custom_prompt(prompt_template)
+         memory = ConversationBufferMemory(
+             memory_key="history",
+             input_key="question",
+         )
+         chain_type_kwargs = {
+             "prompt": prompt,
+             "memory": memory,
+         }
+
+         qa_chain = RetrievalQA.from_chain_type(
+             llm=llm,
+             chain_type="stuff",
+             retriever=db.as_retriever(search_kwargs={"k": 5}),
+             chain_type_kwargs=chain_type_kwargs,
+             return_source_documents=True,
+         )
+
+         return qa_chain
+
+     def retrieval_qa_chain(self, db, prompt_template):
+         """Build a RetrievalQA chain without memory."""
+         llm = self.load_mistral()
+         prompt = self.set_custom_prompt(prompt_template)
+
+         chain_type_kwargs = {
+             "prompt": prompt,
+         }
+
+         qa_chain = RetrievalQA.from_chain_type(
+             llm=llm,
+             chain_type="stuff",
+             retriever=db.as_retriever(search_kwargs={"k": 3}),
+             chain_type_kwargs=chain_type_kwargs,
+             return_source_documents=True,
+         )
+
+         return qa_chain
+
+     def get_sources_document(self, source_documents: list) -> dict:
+         """Map each source file path to the list of pages cited from it.
+
+         Args:
+             source_documents (list): documents returned in the `source_documents` field of the RAG response
+
+         Returns:
+             dict: {
+                 "path/to/file1": [0, 1, 3],
+                 "path/to/file2": [5, 2],
+             }
+         """
+         sources = defaultdict(list)
+         for doc in source_documents:
+             sources[doc.metadata["source"]].append(doc.metadata["page"])
+
+         return sources
+
+     def shape_answer_with_source(self, answer: str, sources: dict):
+         """Append one "Fichier ... - Page ..." line to the answer per cited source.
+
+         Args:
+             answer (str): answer generated by the chain
+             sources (dict): mapping of file path to cited pages
+         """
+         source_msg = ""
+         for path, pages in sources.items():
+             file = os.path.basename(path)
+             source_msg += f"\nFichier: {file} - Page: {pages}"
+
+         answer += f"\n{source_msg}"
+
+         return answer
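
The commit ships a prebuilt index (`data/vector_store/index.faiss` and `index.pkl`) but not the script that produced it. A plausible reconstruction is sketched below; the choice of `PyPDFLoader` is an assumption, made because it emits the `source`/`page` metadata that `get_sources_document()` reads and because `pypdf` is pinned in `requirements.txt`.

```python
# build_index.py — sketch: rebuild data/vector_store from the PDFs in data/.
# Assumes MISTRAL_API_KEY is set and the script runs from the repository root.
import glob

from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS

from rag_module import RagModule


def build_vector_store() -> None:
    rag = RagModule()
    docs = []
    for pdf in glob.glob("data/*.pdf"):
        # Each page becomes a Document with {"source": path, "page": n} metadata.
        docs.extend(PyPDFLoader(pdf).load())

    chunks = rag.text_splitter.split_documents(docs)
    db = FAISS.from_documents(chunks, rag.embedding_model)
    db.save_local(rag.db_faiss_path)  # writes index.faiss and index.pkl


if __name__ == "__main__":
    build_vector_store()
```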
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ langchain==0.1.7
+ langchain-core==0.1.23
+ langchain-mistralai==0.0.4
+ langchain-community==0.0.20
+ faiss-cpu==1.7.4
+ python-dotenv==1.0.1
+ chainlit
+ openai
+ pypdf==4.0.2