Jonglee commited on
Commit
aa3ec35
·
1 Parent(s): a3c2251

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +44 -0
app.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Index the PDFs under ./data into a persisted Chroma vector store and
expose them through a RetrievalQA chain (EN_17272_qa) backed by gpt-3.5-turbo.
"""

import os

from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

# SECURITY FIX: the original committed a literal OpenAI API key here.
# Never hardcode secrets in source -- read the key from the environment
# (and revoke the leaked key immediately).
if "OPENAI_API_KEY" not in os.environ:
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running.")

# Load every PDF found under ./data (one Document per page).
loader = PyPDFDirectoryLoader("./data")
docs = loader.load()

# Split pages into small overlapping chunks so retrieval matches at
# passage granularity rather than whole pages.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,
)
documents = text_splitter.split_documents(docs)

embeddings = OpenAIEmbeddings()

# BUG FIX: the original passed the *unsplit* `docs` to Chroma.from_documents,
# which made the splitting above dead code. Index the split `documents`.
persist_directory = "vector_db"
vectordb = Chroma.from_documents(
    documents,
    embedding=embeddings,
    persist_directory=persist_directory,
)

# Flush the index to disk, then drop the handle so the reload below
# demonstrates loading from the persisted store.
vectordb.persist()
vectordb = None

# Loading the persisted vector store is much quicker than re-embedding
# everything -- that is the benefit of persist_directory.
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embeddings)

# Bring up the chat model used to answer questions.
llm = ChatOpenAI(model_name="gpt-3.5-turbo")

# Expose the vector store as a retriever tool.
doc_retriever = vectordb.as_retriever()

# RetrievalQA "stuff" chain: stuffs the retrieved chunks into a single
# prompt for the LLM.
EN_17272_qa = RetrievalQA.from_chain_type(llm, chain_type="stuff", retriever=doc_retriever)