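"""Interrogateur de PDF: a Gradio app that loads a PDF, embeds its pages with
OpenAI embeddings into a Chroma vector store, and answers questions about the
document through a LangChain ConversationalRetrievalChain (map_reduce)."""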
import openai
import os
import gradio as gr
import chromadb
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.indexes import VectorstoreIndexCreator
from langchain.chains import ConversationalRetrievalChain
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI
def load_document(Document):
    # Loads a PDF document, embeds its pages and stores them in a Chroma vector DB
    if not Document:
        return "Merci de fournir un document PDF"
    if not Document.name.endswith(".pdf"):
        return "Merci de fournir un document PDF"
    loader = PyPDFLoader(Document.name)
    docs = loader.load()
    # Remember the page count so the retriever can later return every page
    global k
    k = len(docs)
    # Create embeddings
    embeddings = OpenAIEmbeddings(openai_api_key=os.environ["OpenaiKey"])
    # Write the pages into the vector store, with one id per page ("page0", "page1", ...)
    global docsearch
    docsearch = Chroma.from_documents(docs, embeddings, ids=["page" + str(d.metadata["page"]) for d in docs])
    # Start a fresh conversation history for the new document
    global chat_history
    chat_history = []
    return "Encodage créé"

def get_chat_history(inputs) -> str:
    # Flatten the (question, answer) pairs into the plain-text history the chain expects
    res = []
    for human, ai in inputs:
        res.append(f"Question : {human}\nRéponse : {ai}")
    return "\n".join(res)

def question_document(Question):
    if "docsearch" not in globals():
        return "Merci d'encoder un document PDF"
    # Define the LLM (temperature 0 for deterministic answers)
    turbo = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0, openai_api_key=os.environ["OpenaiKey"])
    # davinci = OpenAI(model_name="text-davinci-003", openai_api_key=os.environ["OpenaiKey"])  # unused alternative LLM
    # Customized map_reduce prompts, kept for reference:
    # question_template = """{context}
    # State the number at the start of the above text in your answer; it is the page number of this extract in the source document. Label this number as "page".
    # Also make sure to answer in the same language as the following question.
    # QUESTION : {question}
    # ANSWER :
    # """
    # combine_template = """{summaries}
    # Note that the above text is based on transient extracts from one source document.
    # So make sure not to mention different documents, extracts, passages, portions or texts. There is only one, entire document.
    # Also make sure to answer in the same language as the following question.
    # QUESTION : {question}
    # ANSWER :
    # """
    # question_prompt = PromptTemplate(template=question_template, input_variables=["context", "question"])
    # combine_prompt = PromptTemplate(template=combine_template, input_variables=["summaries", "question"])
    # chain_type_kwargs = {"combine_prompt": combine_prompt, "question_prompt": question_prompt}  # , "return_intermediate_steps": True
    # qa = RetrievalQAWithSourcesChain.from_chain_type(llm=turbo, chain_type="map_reduce", chain_type_kwargs=chain_type_kwargs, retriever=docsearch.as_retriever(), return_source_documents=True)
    # Leftover input from the deprecated ChatVectorDBChain; ConversationalRetrievalChain ignores it
    vectordbkwargs = {"search_distance": 10}
    # Retrieve every page of the document (k was set when the PDF was encoded)
    search_kwargs = {"k": k}
    qa = ConversationalRetrievalChain.from_llm(llm=turbo, chain_type="map_reduce", retriever=docsearch.as_retriever(search_kwargs=search_kwargs), get_chat_history=get_chat_history, return_source_documents=True)
    answer = qa({"question": Question, "chat_history": chat_history, "vectordbkwargs": vectordbkwargs}, return_only_outputs=True)
    chat_history.append((Question, answer["answer"]))
    print(answer)
    return get_chat_history(chat_history)

with gr.Blocks() as demo:
    gr.Markdown(
        """
        # Interrogateur de PDF
        par Nicolas et Alex
        """)
    with gr.Row():
        with gr.Column():
            input_file = gr.File(label="Charger un document")
            encode_btn = gr.Button("Encoder le document")
            output_words = gr.Textbox(label="Encodage")
            encode_btn.click(fn=load_document, inputs=input_file, outputs=output_words)
        with gr.Column():
            text = gr.Textbox(label="Question")
            ask_btn = gr.Button("Poser une question")
            answer = gr.Textbox(label="Réponse", lines=8)
            ask_btn.click(fn=question_document, inputs=text, outputs=answer)

demo.launch()
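# To run locally (assuming this file is saved as app.py and the OpenAI key is
# exported under the name OpenaiKey, as read by os.environ above):
#   OpenaiKey=sk-... python app.py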