Spaces:

Volkopat
/

arXivGPT

Runtime error

App Files Files Community

Volko committed on Apr 13, 2023

Commit

d98144d

1 Parent(s): c7b2ed6

Version 1.0

Browse files

Files changed (4) hide show

app.py +138 -0
pdf2vectorstore.py +72 -0
requirements.txt +11 -0
template.py +18 -0

app.py ADDED Viewed

	@@ -0,0 +1,138 @@

+import os
+import pickle
+from typing import Optional, Tuple
+import gradio as gr
+from threading import Lock
+from langchain.llms import OpenAI
+from langchain.chains import ChatVectorDBChain
+from template import QA_PROMPT, CONDENSE_QUESTION_PROMPT
+from pdf2vectorstore import convert_to_vectorstore
+def get_chain(api_key, vectorstore, model_name):
+    llm = OpenAI(model_name = model_name, temperature=0,  openai_api_key=api_key)
+    qa_chain = ChatVectorDBChain.from_llm(
+        llm,
+        vectorstore,
+        qa_prompt=QA_PROMPT,
+        condense_question_prompt=CONDENSE_QUESTION_PROMPT,
+    )
+    return qa_chain
+def set_openai_api_key(api_key: str, vectorstore, model_name: str):
+    if api_key:
+        chain = get_chain(api_key, vectorstore, model_name)
+        return chain
+class ChatWrapper:
+    def __init__(self):
+        self.lock = Lock()
+        self.previous_url = ""
+        self.vectorstore_state = None
+        self.chain = None
+    def __call__(
+        self,
+        api_key: str,
+        arxiv_url: str,
+        inp: str,
+        history: Optional[Tuple[str, str]],
+        model_name: str,
+    ):
+        if not arxiv_url or not api_key:
+            history = history or []
+            history.append((inp, "Please provide both arXiv URL and API key to begin"))
+            return history, history
+        if arxiv_url != self.previous_url:
+            history = []
+            vectorstore = convert_to_vectorstore(arxiv_url, api_key)
+            self.previous_url = arxiv_url
+            self.chain  = set_openai_api_key(api_key, vectorstore, model_name)
+            self.vectorstore_state = vectorstore
+        if self.chain  is None:
+            self.chain  = set_openai_api_key(api_key, self.vectorstore_state, model_name)
+        self.lock.acquire()
+        try:
+            history = history or []
+            if self.chain  is None:
+                history.append((inp, "Please paste your OpenAI key to use"))
+                return history, history
+            import openai
+            openai.api_key = api_key
+            output = self.chain ({"question": inp, "chat_history": history})["answer"]
+            history.append((inp, output))
+        except Exception as e:
+            raise e
+        finally:
+            api_key = ""
+            self.lock.release()
+        return history, history
+chat = ChatWrapper()
+block = gr.Blocks(css=".gradio-container {background-color: #f8f8f8; font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif}")
+with block:
+    gr.HTML("<h1 style='text-align: center;'>ArxivGPT</h1>")
+    gr.HTML("<h3 style='text-align: center;'>Ask questions about research papers</h3>")
+    with gr.Row():
+        with gr.Column(width="auto"):
+            openai_api_key_textbox = gr.Textbox(
+                label="OpenAI API Key",
+                placeholder="Paste your OpenAI API key (sk-...)",
+                show_label=True,
+                lines=1,
+                type="password",
+            )
+        with gr.Column(width="auto"):
+            arxiv_url_textbox = gr.Textbox(
+                label="Arxiv URL",
+                placeholder="Enter the arXiv URL",
+                show_label=True,
+                lines=1,
+            )
+        with gr.Column(width="auto"):
+            model_dropdown = gr.Dropdown(
+                label="Choose a model (GPT-4 coming soon!)",
+                choices=["gpt-3.5-turbo"],
+            )
+    chatbot = gr.Chatbot()
+    with gr.Row():
+        message = gr.Textbox(
+            label="What's your question?",
+            placeholder="Ask questions about the paper you just linked",
+            lines=1,
+        )
+        submit = gr.Button(value="Send", variant="secondary").style(full_width=False)
+    gr.Examples(
+        examples=[
+            "Please give me a brief summary about this paper",
+            "Are there any interesting correlations in the given paper?",
+            "How can this paper be applied in the real world?",
+            "What are the limitations of this paper?",
+        ],
+        inputs=message,
+    )
+    gr.HTML(
+        "<center style='margin-top: 20px;'>Powered by <a href='https://github.com/hwchase17/langchain'>LangChain 🦜️🔗</a></center>"
+    )
+    state = gr.State()
+    submit.click(chat,
+                 inputs=[openai_api_key_textbox, arxiv_url_textbox, message, state, model_dropdown],
+                 outputs=[chatbot, state])
+    message.submit(chat,
+                   inputs=[openai_api_key_textbox, arxiv_url_textbox, message, state, model_dropdown],
+                   outputs=[chatbot, state])
+block.launch(share=True, debug=True, width=800)

pdf2vectorstore.py ADDED Viewed

	@@ -0,0 +1,72 @@

+import os
+import requests
+from bs4 import BeautifulSoup
+from pdf2image import convert_from_path
+import pytesseract
+import pickle
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.document_loaders import UnstructuredFileLoader
+from langchain.vectorstores.faiss import FAISS
+from langchain.embeddings import OpenAIEmbeddings
+def download_pdf(url, filename):
+    print("Downloading pdf...")
+    response = requests.get(url, stream=True)
+    with open(filename, 'wb') as f:
+        for chunk in response.iter_content(chunk_size=8192):
+            f.write(chunk)
+def extract_pdf_text(filename):
+    print("Extracting text from pdf...")
+    pytesseract.pytesseract.tesseract_cmd = 'tesseract'
+    images = convert_from_path(filename)
+    text = ""
+    for image in images:
+        text += pytesseract.image_to_string(image)
+    return text
+def get_arxiv_pdf_url(paper_link):
+    if paper_link.endswith('.pdf'):
+        return paper_link
+    else:
+        print("Getting pdf url...")
+        response = requests.get(paper_link)
+        soup = BeautifulSoup(response.text, 'html.parser')
+        pdf_url = soup.find('a', {'class': 'mobile-submission-download'})['href']
+        pdf_url = 'https://arxiv.org' + pdf_url
+        return pdf_url
+def read_paper(paper_link):
+    print("Reading paper...")
+    pdf_filename = 'paper.pdf'
+    pdf_url = get_arxiv_pdf_url(paper_link)
+    download_pdf(pdf_url, pdf_filename)
+    text = extract_pdf_text(pdf_filename)
+    os.remove(pdf_filename)
+    return text
+def convert_to_vectorstore(arxiv_url, api_key):
+    if not arxiv_url or not api_key:
+        return None
+    print("Converting to vectorstore...")
+    txtfile = "paper.txt"
+    with open(txtfile, 'w') as f:
+        f.write(read_paper(arxiv_url))
+    loader = UnstructuredFileLoader(txtfile)
+    raw_documents = loader.load()
+    os.remove(txtfile)
+    print("Loaded document")
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
+    documents = text_splitter.split_documents(raw_documents)
+    os.environ["OPENAI_API_KEY"] = api_key
+    embeddings = OpenAIEmbeddings()
+    os.environ["OPENAI_API_KEY"] = ""
+    vectorstore = FAISS.from_documents(documents, embeddings)
+    return vectorstore

requirements.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+requests
+beautifulsoup4
+pdfminer.six
+PyMuPDF
+pdf2image
+pytesseract
+unstructured
+gradio
+faiss-cpu
+langchain
+tiktoken

template.py ADDED Viewed

	@@ -0,0 +1,18 @@

+from langchain.prompts.prompt import PromptTemplate
+_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question.
+Chat History:
+{chat_history}
+Follow Up Input: {question}
+Standalone question:"""
+CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)
+template = """You are an AI assistant for answering questions about the contents of the research paper in Arxiv.
+You are given the following extracted parts of a long document and a question. Provide a conversational answer.
+If you don't know the answer, just say "Hmm, I'm not sure." Don't try to make up an answer.
+Question: {question}
+=========
+{context}
+=========
+Answer in Markdown:"""
+QA_PROMPT = PromptTemplate(template=template, input_variables=["question", "context"])