File size: 4,749 Bytes
be63200
db70198
375bd04
db70198
 
 
 
 
 
1dc9fa7
1ce831d
 
 
 
 
 
db70198
7713f97
1ce831d
68eaa27
db70198
18a32c9
b5bc349
 
 
18a32c9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7713f97
 
 
3373c54
db70198
18a32c9
1ce831d
db70198
 
 
1ce831d
db70198
 
 
7a185b4
 
 
 
 
 
 
 
 
 
1ce831d
 
7a185b4
 
 
 
18a32c9
db70198
 
 
 
 
 
 
b5bc349
db70198
 
 
 
 
be63200
de20d93
db70198
259cbe8
9e53bcd
18a32c9
 
 
1ce831d
 
 
18a32c9
 
 
 
 
db70198
18a32c9
 
 
 
3373c54
18a32c9
 
 
db70198
18a32c9
1ce831d
18a32c9
 
 
 
 
 
 
 
 
 
 
 
 
 
1ce831d
18a32c9
7713f97
18a32c9
 
 
 
b5bc349
18a32c9
 
7713f97
18a32c9
 
7713f97
18a32c9
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import os
import tempfile

import streamlit as st
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.memory.chat_message_histories import StreamlitChatMessageHistory
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import (
    Docx2txtLoader,
    PyPDFLoader,
    TextLoader,
    UnstructuredEPubLoader,
)
from langchain_community.vectorstores import DocArrayInMemorySearch

from calback_handler import PrintRetrievalHandler, StreamHandler
from chat_profile import ChatProfileRoleEnum

# configs
# Model identifiers: the OpenAI chat model used for Q&A and the local
# sentence-transformers model used to embed document chunks.
LLM_MODEL_NAME = "gpt-3.5-turbo"
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"

# NOTE: st.set_page_config must be the first Streamlit command executed on
# each script rerun — keep it ahead of any other st.* call.
# NOTE(review): emoji shortcodes like ":books:" are likely shown literally
# in the browser-tab title (page_title is not markdown) — confirm and
# consider using a real emoji character instead.
st.set_page_config(
    page_title=":books: InkChatGPT: Chat with Documents",
    page_icon="πŸ“š",
    initial_sidebar_state="collapsed",
    menu_items={
        "Get Help": "https://x.com/vinhnx",
        "Report a bug": "https://github.com/vinhnx/InkChatGPT/issues",
        "About": "InkChatGPT is a Streamlit application that allows users to upload PDF documents and engage in a conversational Q&A with a language model (LLM) based on the content of those documents.",
    },
)

# App masthead: logo, title, and tagline.
st.image("./assets/icon.jpg", width=100)
st.header(
    ":gray[:books: InkChatGPT]",
    divider="blue",
)
st.write("**Chat** with Documents")

# Chat history stored in Streamlit session state so the conversation
# survives script reruns within the same browser session.
msgs = StreamlitChatMessageHistory()


@st.cache_resource(ttl="1h")
def configure_retriever(files):
    """Build an MMR retriever over the uploaded documents.

    Each uploaded file is written into a temporary directory, loaded with a
    format-specific LangChain loader, split into overlapping chunks,
    embedded with the local sentence-transformers model, and indexed in an
    in-memory vector store. The result is cached by Streamlit for 1 hour.

    Args:
        files: Streamlit ``UploadedFile`` objects selected by the user.

    Returns:
        A retriever over the indexed chunks, or ``None`` when no supported
        document could be loaded (preserves the original ``None`` contract).
    """
    # Map each supported extension to its loader class.
    loaders_by_extension = {
        ".pdf": PyPDFLoader,
        ".docx": Docx2txtLoader,
        ".txt": TextLoader,
        ".epub": UnstructuredEPubLoader,
    }

    docs = []
    # Context manager guarantees the temp files are removed deterministically
    # once loading finishes, instead of whenever the TemporaryDirectory
    # object happens to be garbage-collected.
    with tempfile.TemporaryDirectory() as temp_dir:
        for file in files:
            temp_filepath = os.path.join(temp_dir, file.name)
            with open(temp_filepath, "wb") as f:
                f.write(file.getvalue())

            # Lower-case the suffix so ".PDF", ".Txt", etc. are accepted too.
            extension = os.path.splitext(temp_filepath)[1].lower()

            loader_cls = loaders_by_extension.get(extension)
            if loader_cls is None:
                # Skip just this file rather than aborting the whole upload.
                st.write("This document format is not supported!")
                continue

            docs.extend(loader_cls(temp_filepath).load())

    if not docs:
        # Nothing loadable was uploaded.
        return None

    # Split documents into overlapping chunks for retrieval.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
    splits = text_splitter.split_documents(docs)

    # Create embeddings and store them in an in-memory vector store.
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
    vectordb = DocArrayInMemorySearch.from_documents(splits, embeddings)

    # MMR search balances relevance with diversity among retrieved chunks.
    retriever = vectordb.as_retriever(
        search_type="mmr", search_kwargs={"k": 2, "fetch_k": 4}
    )

    return retriever


# Sidebar: document upload. Accepted types mirror the loaders handled in
# configure_retriever().
with st.sidebar.expander("Documents"):
    st.subheader("Files")
    uploaded_files = st.file_uploader(
        label="Select files",
        type=["pdf", "txt", "docx", "epub"],
        accept_multiple_files=True,
    )

# Sidebar: OpenAI API key entry and chat-history reset.
with st.sidebar.expander("Setup"):
    st.subheader("API Key")
    openai_api_key = st.text_input("OpenAI API Key", type="password")

    # Seed a fresh conversation with a greeting, or reset on button click.
    is_empty_chat_messages = len(msgs.messages) == 0
    if is_empty_chat_messages or st.button("Clear message history"):
        msgs.clear()
        msgs.add_ai_message("How can I help you?")

# Gate: stop this script run (Streamlit re-executes top-to-bottom on every
# interaction) until the user supplies an API key.
if not openai_api_key:
    st.info("Please add your OpenAI API key in the sidebar to continue.")
    st.stop()

# Main chat UI — only rendered once at least one document has been uploaded.
if uploaded_files:
    # Build (or fetch from the 1h cache) the retriever over the uploads.
    # NOTE(review): configure_retriever can return None (unsupported format);
    # the chain construction below would then fail — consider guarding.
    result_retriever = configure_retriever(uploaded_files)

    # Conversation memory backed by the Streamlit-persisted history above.
    memory = ConversationBufferMemory(
        memory_key="chat_history", chat_memory=msgs, return_messages=True
    )

    # Setup LLM and QA chain; streaming=True lets StreamHandler render
    # tokens incrementally as they arrive.
    llm = ChatOpenAI(
        model_name=LLM_MODEL_NAME,
        openai_api_key=openai_api_key,
        temperature=0,
        streaming=True,
    )

    chain = ConversationalRetrievalChain.from_llm(
        llm, retriever=result_retriever, memory=memory, verbose=False
    )

    # Map stored message types to Streamlit chat roles for display.
    avatars = {
        ChatProfileRoleEnum.Human: "user",
        ChatProfileRoleEnum.AI: "assistant",
    }

    # Replay the stored conversation on every rerun so history stays visible.
    for msg in msgs.messages:
        st.chat_message(avatars[msg.type]).write(msg.content)

    if user_query := st.chat_input(placeholder="Ask me anything!"):
        st.chat_message("user").write(user_query)

        with st.chat_message("assistant"):
            # Callbacks stream retrieval status and answer tokens into
            # dedicated placeholders inside the assistant message.
            retrieval_handler = PrintRetrievalHandler(st.empty())
            stream_handler = StreamHandler(st.empty())
            # NOTE(review): chain.run is deprecated in newer LangChain
            # releases (prefer chain.invoke). The return value is unused
            # here because StreamHandler already rendered the answer.
            response = chain.run(
                user_query, callbacks=[retrieval_handler, stream_handler]
            )