from langchain.document_loaders import PyPDFDirectoryLoader

# Load every PDF found in the ./data directory
loader = PyPDFDirectoryLoader("./data")
docs = loader.load()

from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the documents into small, overlapping chunks for embedding
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,
)
documents = text_splitter.split_documents(docs)

import os

# Never hard-code a real API key; use a placeholder and supply the
# real value via your environment instead
os.environ["OPENAI_API_KEY"] = "YOUR_OPENAI_API_KEY"

from langchain.embeddings.openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()

from langchain.vectorstores import Chroma

# Embed the split chunks (not the raw docs) and persist them to disk
persist_directory = "vector_db"
vectordb = Chroma.from_documents(
    documents,
    embedding=embeddings,
    persist_directory=persist_directory,
)
vectordb.persist()
vectordb = None

# As you can see when you run the following cell -
# loading the persisted vector store is much quicker than re-instantiating it -
# and that is the benefit of persist_directory!
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embeddings)

# Bring up ChatOpenAI
from langchain.chat_models import ChatOpenAI

llm = ChatOpenAI(model_name="gpt-3.5-turbo")

# Set up the document vector store as a retriever tool
doc_retriever = vectordb.as_retriever()

# Now set up the RetrievalQA chain and leverage all the documents in the vector DB
from langchain.chains import RetrievalQA

EN_17272_qa = RetrievalQA.from_chain_type(llm, chain_type="stuff", retriever=doc_retriever)


def make_answer(query):
    return EN_17272_qa.run(query)


if __name__ == "__main__":
    # Make a Gradio interface
    import gradio as gr

    gr.Interface(
        make_answer,
        gr.Textbox(lines=2, label="Input a question"),
        gr.Textbox(label="Answer"),
        title="EN-17272 & Efficacy Reports",
        description="EN-17272 & Efficacy Reports is a generative model that gives answers based on the documents",
    ).launch()
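
# Optional sanity check (a sketch, not part of the pipeline above): before
# launching the UI, confirm the retriever actually surfaces relevant chunks
# from the persisted store. The query string below is only illustrative.
#
# chunks = doc_retriever.get_relevant_documents("What does EN 17272 cover?")
# for chunk in chunks:
#     print(chunk.metadata.get("source"), "->", chunk.page_content[:80])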