# Mdean77's picture
# Experimenting with user interface
# 2a03ddd
import os
from dotenv import load_dotenv
import time
load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
import chainlit as cl
import pymupdf
import tiktoken
from langchain_core.documents.base import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
import getVectorstore
from getVectorstore import getVectorstore
from qdrant_client.http import models as rest
from langchain.prompts import ChatPromptTemplate
import prompts
from prompts import rag_prompt_template
from defaults import default_llm
from operator import itemgetter
from langchain.schema.output_parser import StrOutputParser
from datetime import date
from queries import summary_query
from queries import background_query
from queries import number_of_participants_query
from queries import study_procedures_query
from queries import alt_procedures_query
from queries import risks_query
from queries import benefits_query
def _find_page_range(toc: list, page_count: int) -> tuple[int, int]:
    """Return the 1-based (start_page, end_page) span of the protocol body.

    The body starts on the page after the "List of Figures" table-of-contents
    entry (page 1 when there is no such entry) and ends on the page of the
    "References"/"Bibliography" entry (the last page when there is none).
    When several entries match, the last one wins.

    Args:
        toc: Table-of-contents rows of the form ``[level, title, page]``.
        page_count: Total number of pages in the document.
    """
    start_page = 1
    for _, title, page in toc:
        if title == "List of Figures":
            print(f"{title} on page {page}")
            start_page = page + 1

    end_page = page_count
    for _, title, page in toc:
        if "References" in title or "Bibliography" in title:
            print(f"{title} on page {page}")
            end_page = page

    return start_page, end_page


def _extract_text(doc, start_page: int, end_page: int, rect) -> tuple[str, int]:
    """Extract the protocol text and count the pages that contributed to it.

    When front matter is being skipped (``start_page != 1``) the first three
    pages (indices 0-2) are still captured, unclipped. Pages ``start_page``
    through ``end_page`` (1-based, inclusive) are captured through ``rect`` so
    that headers and footers are left out. All other pages are ignored.

    Returns:
        ``(extracted_text, pages_processed)``.
    """
    body_pages = range(start_page - 1, end_page)  # page.number is 0-based
    parts = []
    pages_processed = 0
    for page in doc.pages():
        if start_page != 1 and page.number in (0, 1, 2):
            parts.append(page.get_text())
        elif page.number in body_pages:
            parts.append(page.get_text(clip=rect))
        else:
            continue
        pages_processed += 1
    # Join once instead of repeatedly growing a string inside the loop.
    return "".join(parts), pages_processed


def _build_rag_chain(documents: list, document_title: str):
    """Store ``documents`` in the vector store and build the RAG chain over them.

    The vector store may hold several protocols that have been stored and
    persisted, so retrieval is filtered on ``metadata.document_title`` to keep
    the context limited to the document processed in this session.
    """
    qdrant_vectorstore = getVectorstore(documents, document_title)
    protocol_retriever = qdrant_vectorstore.as_retriever(
        search_kwargs={
            'filter': rest.Filter(
                must=[
                    rest.FieldCondition(
                        key="metadata.document_title",
                        match=rest.MatchAny(any=[document_title]),
                    )
                ]
            ),
            'k': 15,
        }
    )
    rag_prompt = ChatPromptTemplate.from_template(prompts.rag_prompt_template)
    return (
        {"context": itemgetter("question") | protocol_retriever, "question": itemgetter("question")}
        | rag_prompt | default_llm | StrOutputParser()
    )


async def _ask_chain(rag_chain, question: str) -> str:
    """Run ``rag_chain.invoke`` for ``question`` without blocking the event loop.

    ``invoke`` is synchronous; ``cl.make_async`` runs it in a worker thread so
    the async handlers stay responsive while the chain is working.
    """
    return await cl.make_async(rag_chain.invoke)({"question": question})


@cl.on_chat_start
async def on_chat_start():
    """Ask for a protocol PDF, index it, and present the consent-form actions.

    Steps: wait for a PDF upload, extract the body text (clipping headers and
    footers), split it into chunks, store the chunks in the vector store,
    build a RAG chain filtered to this document, send the consent-form heading
    (title, principal investigator, version date, funding source) and finally
    send the section-selection action buttons. The chain is kept in the
    Chainlit user session so the action callbacks use the chain that belongs
    to the session that triggered them.
    """
    # AskFileMessage.send() returns None when the timeout elapses without an
    # upload; keep prompting instead of failing on files[0].
    files = None
    while not files:
        files = await cl.AskFileMessage(
            content="Upload a file to proceed",
            accept=["application/pdf"],
            max_size_mb=50,
            timeout=180,
        ).send()

    file = files[0]
    print(f"filename is {file.name}")

    # need a rect that will exclude headers and footers
    rect = pymupdf.Rect(0.0, 100.0, 612.0, 650.0)

    # The context manager closes the PDF once the text has been extracted.
    with pymupdf.Document(file.path) as doc:
        start_page, end_page = _find_page_range(doc.get_toc(), len(doc))
        print(f"Extraction should start on page {start_page} and end on page {end_page}")
        extracted_text, pages_processed = _extract_text(doc, start_page, end_page, rect)

    await cl.Message(
        content=f"""Processing selected file: `{file.name}`...
Extraction beginning on page {start_page} and ending on page {end_page}.
Using a clipping rectangle to exclude headers and footers ({rect}).
Processed {pages_processed} pages of PDF document.
Length of extracted text string is {len(extracted_text)}
"""
    ).send()

    chunk_size = 3000
    chunk_overlap = 200
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    text_chunks = text_splitter.split_text(extracted_text)
    document = [Document(page_content=chunk) for chunk in text_chunks]

    await cl.Message(
        content=f"""Splitting the text with a recursive character splitter.
Set chunk size at {chunk_size} and overlap at {chunk_overlap}.
Number of resulting chunks: {len(text_chunks)}.
Document created from chunks to get stored in vector database.
Length of the document: {len(document)} (should be same as number of chunks).
"""
    ).send()

    rag_chain = _build_rag_chain(document, file.name)
    # Per-session storage: the action callbacks below read the chain from here.
    cl.user_session.set("rag_chain", rag_chain)

    # Heading for top of ICF document
    protocol_title = await _ask_chain(
        rag_chain,
        "What is the exact title of this protocol? Only return the title itself without any other description.",
    )
    principal_investigator = await _ask_chain(
        rag_chain,
        "What is the name of the principal investigator of the study? Only return the name itself without any other description.",
    )
    support = await _ask_chain(
        rag_chain,
        "What agency is funding the study? Only return the name of the agency without any other description.",
    )
    version_date = date.today().strftime("%B %d, %Y")

    await cl.Message(
        content=f"""
**Study Title:** {protocol_title}
**Principal Investigator:** {principal_investigator}
**Version Date:** {version_date}
**Source of Support:** {support}
---
"""
    ).send()

    # Sending action buttons within a chatbot message
    actions = [
        cl.Action(
            name="summary_button",
            icon="mouse-pointer-click",
            payload={"value": "summary"},
            label="Write summary",
        ),
        cl.Action(
            name="risk_button",
            icon="mouse-pointer-click",
            payload={"value": "risks"},
            label="Write risk section",
        ),
        cl.Action(
            name="benefits_button",
            icon="mouse-pointer-click",
            payload={"value": "benefits"},
            label="Write benefits section",
        ),
        cl.Action(
            name="file_button",
            icon="mouse-pointer-click",
            payload={"value": "markdown"},
            label="Create final file",
        ),
    ]
    await cl.Message(content="Select consent form sections:", actions=actions).send()


async def _send_section(action: cl.Action, query: str) -> None:
    """Generate one consent-form section with the session's chain and send it.

    Sends the generated text followed by a short "Executed ..." confirmation
    built from the action payload. If the session has no chain yet, a notice
    is sent instead.
    """
    rag_chain = cl.user_session.get("rag_chain")
    if rag_chain is None:
        await cl.Message(
            content="Upload a protocol before generating consent form sections."
        ).send()
        return

    section = await _ask_chain(rag_chain, query)
    await cl.Message(content=section).send()
    # Single quotes inside the f-string keep this valid before Python 3.12.
    await cl.Message(content=f"Executed {action.payload['value']}").send()


@cl.action_callback("summary_button")
async def on_summary_action(action: cl.Action):
    """Write the summary section when the "Write summary" button is clicked."""
    await _send_section(action, summary_query())


@cl.action_callback("risk_button")
async def on_risk_action(action: cl.Action):
    """Write the risk section when the "Write risk section" button is clicked."""
    await _send_section(action, risks_query())


@cl.action_callback("benefits_button")
async def on_benefits_action(action: cl.Action):
    """Write the benefits section when the "Write benefits section" button is clicked."""
    await _send_section(action, benefits_query())
# @cl.action_callback("file_button")
# async def on_action(action: cl.Action):
# await cl.Message(content=f"Executed {action.payload["value"]}").send()
# await action.remove()
# # Now let's test the application to make a consent document
# start_time = time.time()
# # Brute force method that just saves each generated section as string
# summary = rag_chain.invoke({"question":summary_query()})
# background = rag_chain.invoke({"question":background_query()})
# number_of_participants = rag_chain.invoke({"question":number_of_participants_query()})
# study_procedures = rag_chain.invoke({"question":study_procedures_query()})
# alt_procedures = rag_chain.invoke({"question":alt_procedures_query()})
# risks = rag_chain.invoke({"question":risks_query()})
# benefits = rag_chain.invoke({"question":benefits_query()})
# end_time = time.time()
# execution_time = end_time - start_time
# msg = cl.Message(
# content=f"""
# Brute force (sequential) execution time: {execution_time:.2f} seconds.
# {summary}
# {background}
# {number_of_participants}
# {study_procedures}
# {alt_procedures}
# {risks}
# {benefits}
# """
# )
# await msg.send()