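"""
InkChatGPT: a Streamlit app that lets users upload documents (PDF, DOCX,
TXT, EPUB) and chat with an LLM about their contents via a retrieval chain.
"""
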
import os
import tempfile
import streamlit as st
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.memory.chat_message_histories import StreamlitChatMessageHistory
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import (
    Docx2txtLoader,
    PyPDFLoader,
    TextLoader,
    UnstructuredEPubLoader,
)
from langchain_community.vectorstores import DocArrayInMemorySearch
from calback_handler import PrintRetrievalHandler, StreamHandler
from chat_profile import ChatProfileRoleEnum

# Configuration
LLM_MODEL_NAME = "gpt-3.5-turbo"  # OpenAI chat model used for answering
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"  # sentence-transformers model for local embeddings

st.set_page_config(
    page_title=":books: InkChatGPT: Chat with Documents",
    page_icon="📚",
    initial_sidebar_state="collapsed",
    menu_items={
        "Get Help": "https://x.com/vinhnx",
        "Report a bug": "https://github.com/vinhnx/InkChatGPT/issues",
        "About": "InkChatGPT is a Streamlit application that allows users to upload PDF documents and engage in a conversational Q&A with a language model (LLM) based on the content of those documents.",
    },
)
st.image("./assets/icon.jpg", width=100)
st.header(
    ":gray[:books: InkChatGPT]",
    divider="blue",
)
st.write("**Chat** with Documents")
# Set up memory for contextual conversation.
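# StreamlitChatMessageHistory stores the message list in st.session_state,
# so the conversation survives Streamlit script reruns.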
msgs = StreamlitChatMessageHistory()
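
# Cache the retriever for up to an hour so reruns with the same uploaded
# files don't re-load and re-embed every document.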
@st.cache_resource(ttl="1h")
def configure_retriever(files):
    # Read documents into temporary files so the loaders can open them by path
    docs = []
    temp_dir = tempfile.TemporaryDirectory()
    for file in files:
        temp_filepath = os.path.join(temp_dir.name, file.name)
        with open(temp_filepath, "wb") as f:
            f.write(file.getvalue())

        _, extension = os.path.splitext(temp_filepath)
        # Load the file using the appropriate loader
        if extension == ".pdf":
            loader = PyPDFLoader(temp_filepath)
        elif extension == ".docx":
            loader = Docx2txtLoader(temp_filepath)
        elif extension == ".txt":
            loader = TextLoader(temp_filepath)
        elif extension == ".epub":
            loader = UnstructuredEPubLoader(temp_filepath)
        else:
            st.write("This document format is not supported!")
            return None

        docs.extend(loader.load())

    # Split documents into overlapping chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
    splits = text_splitter.split_documents(docs)

    # Create embeddings and store them in an in-memory vector store
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
    vectordb = DocArrayInMemorySearch.from_documents(splits, embeddings)
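
    # MMR (maximal marginal relevance) fetches `fetch_k` candidate chunks,
    # then returns the `k` results re-ranked for diversity, which reduces
    # near-duplicate context being sent to the LLM.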
    retriever = vectordb.as_retriever(
        search_type="mmr", search_kwargs={"k": 2, "fetch_k": 4}
    )
    return retriever
with st.sidebar.expander("Documents"):
st.subheader("Files")
uploaded_files = st.file_uploader(
label="Select files",
type=["pdf", "txt", "docx", "epub"],
accept_multiple_files=True,
)
with st.sidebar.expander("Setup"):
st.subheader("API Key")
openai_api_key = st.text_input("OpenAI API Key", type="password")
is_empty_chat_messages = len(msgs.messages) == 0
if is_empty_chat_messages or st.button("Clear message history"):
    msgs.clear()
    msgs.add_ai_message("How can I help you?")

if not openai_api_key:
    st.info("Please add your OpenAI API key in the sidebar to continue.")
    st.stop()

# Stop early when no documents are uploaded; the QA chain below needs a retriever
if not uploaded_files:
    st.info("Please upload at least one document to continue.")
    st.stop()

result_retriever = configure_retriever(uploaded_files)
if result_retriever is None:
    st.stop()
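
# Wrap the Streamlit-backed history in ConversationBufferMemory so the
# chain reads and writes the same message list that the UI renders.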
memory = ConversationBufferMemory(
    memory_key="chat_history", chat_memory=msgs, return_messages=True
)

# Set up the LLM with streaming enabled so tokens can be rendered as they arrive
llm = ChatOpenAI(
    model_name=LLM_MODEL_NAME,
    openai_api_key=openai_api_key,
    temperature=0,
    streaming=True,
)
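
# ConversationalRetrievalChain condenses the chat history and the new
# question into a standalone query, retrieves matching chunks, and answers.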
chain = ConversationalRetrievalChain.from_llm(
    llm, retriever=result_retriever, memory=memory, verbose=False
)
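
# Map message types to Streamlit chat roles for rendering; this assumes the
# ChatProfileRoleEnum values in chat_profile match the `type` strings
# ("human" / "ai") that LangChain sets on stored messages.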
avatars = {
    ChatProfileRoleEnum.Human: "user",
    ChatProfileRoleEnum.AI: "assistant",
}
for msg in msgs.messages:
    st.chat_message(avatars[msg.type]).write(msg.content)
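
# Handle new input: echo the user's message, then run the chain.
# PrintRetrievalHandler and StreamHandler (from the local calback_handler
# module) are expected to render retrieval status and stream tokens into
# the st.empty() placeholders as the chain executes.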
if user_query := st.chat_input(placeholder="Ask me anything!"):
    st.chat_message("user").write(user_query)

    with st.chat_message("assistant"):
        retrieval_handler = PrintRetrievalHandler(st.empty())
        stream_handler = StreamHandler(st.empty())
        response = chain.run(
            user_query, callbacks=[retrieval_handler, stream_handler]
        )