# en17272/app.py
# Load every PDF in the ./data directory
from langchain.document_loaders import PyPDFDirectoryLoader

loader = PyPDFDirectoryLoader("./data")
docs = loader.load()
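# Optional sanity check (not in the original script): confirm the PDFs in
# ./data were actually picked up before going further.
print(f"Loaded {len(docs)} pages from ./data")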
# Split the pages into small overlapping chunks for embedding
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,
)
documents = text_splitter.split_documents(docs)
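# Optional: peek at the first chunk to verify the 100-character /
# 20-character-overlap split looks sensible (assumes ./data is non-empty).
print(documents[0].page_content)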
import os

# Read the OpenAI key from the environment (e.g. a Hugging Face Space
# secret) rather than hard-coding it in source as the original did.
assert os.environ.get("OPENAI_API_KEY"), "Set the OPENAI_API_KEY environment variable"

from langchain.embeddings.openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()
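# Optional sketch: each query is embedded into a fixed-size vector; for the
# default text-embedding-ada-002 model this should be 1536 dimensions.
vector = embeddings.embed_query("hello world")
print(len(vector))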
from langchain.vectorstores import Chroma

persist_directory = "vector_db"

# Embed the split chunks (not the raw pages, which the original passed in
# by mistake) and persist the store to disk.
vectordb = Chroma.from_documents(documents, embedding=embeddings, persist_directory=persist_directory)
vectordb.persist()
vectordb = None
# Reloading the persisted vector store below is much quicker than
# rebuilding it from the documents - that is the benefit of persist_directory.
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
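# Optional sketch: run a quick similarity search against the reloaded store
# to confirm it came back intact (the query text is a hypothetical example).
hits = vectordb.similarity_search("air disinfection efficacy", k=3)
print([hit.page_content[:80] for hit in hits])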
# Bring up the ChatOpenAI LLM
from langchain.chat_models import ChatOpenAI

llm = ChatOpenAI(model_name="gpt-3.5-turbo")
# Set up the document vector store as a Retriever tool
doc_retriever = vectordb.as_retriever()
# Now set up the RetrievalQA chain; the "stuff" chain type inserts the
# retrieved chunks directly into the LLM prompt
from langchain.chains import RetrievalQA

EN_17272_qa = RetrievalQA.from_chain_type(llm, chain_type="stuff", retriever=doc_retriever)
def make_answer(query):
    return EN_17272_qa.run(query)
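# Example usage (hypothetical question), handy for testing outside Gradio:
# print(make_answer("What does EN 17272 cover?"))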
if __name__ == "__main__":
    # Build a minimal Gradio interface around the QA chain
    import gradio as gr

    gr.Interface(
        fn=make_answer,
        inputs=gr.Textbox(lines=2, label="Input a question"),
        outputs=gr.Textbox(label="Answer"),
        title="EN-17272 & Efficacy Reports",
        description="EN-17272 & Efficacy Reports is a generative model that answers questions based on the loaded documents",
    ).launch()