# --- Scrape artifacts from the hosting page (not Python source) ---
# Spaces: Runtime error / Runtime error  (Hugging Face Spaces status banner)
# File size: 8,736 Bytes
# (Commit-hash and line-number gutters from the scraped page removed; they
#  were not part of the module and made the file unimportable as Python.)
import os
from dotenv import load_dotenv
import time
load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
import chainlit as cl
import pymupdf
import tiktoken
from langchain_core.documents.base import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
import getVectorstore
from getVectorstore import getVectorstore
from qdrant_client.http import models as rest
from langchain.prompts import ChatPromptTemplate
import prompts
from prompts import rag_prompt_template
from defaults import default_llm
from operator import itemgetter
from langchain.schema.output_parser import StrOutputParser
from datetime import date
from queries import summary_query
from queries import background_query
from queries import number_of_participants_query
from queries import study_procedures_query
from queries import alt_procedures_query
from queries import risks_query
from queries import benefits_query
@cl.on_chat_start
async def on_chat_start():
    """Handle a new chat session.

    Ask the user for a protocol PDF, extract its body text (skipping front
    matter, references, headers and footers), chunk and embed it into a
    Qdrant vectorstore filtered by document title, build a RAG chain over
    it, display the consent-form header fields, and offer action buttons
    for generating individual consent-form sections.
    """
    files = await cl.AskFileMessage(
        content="Upload a file to proceed",
        accept=["application/pdf"],
        max_size_mb=50,
        timeout=180,
    ).send()
    file = files[0]
    print(f"filename is {file.name}")
    doc = pymupdf.Document(file.path)
    toc = doc.get_toc()
    # Want to find the List of Figures page because that is the last page I
    # want to skip. Default is 1 if I do not find a better start location.
    start_page = 1
    for _, title, page in toc:
        if title == "List of Figures":
            print(f"{title} on page {page}")
            start_page = page + 1
    # Get the last page I want included; default is the last page of the
    # document. "References"/"Bibliography" and anything after is excluded.
    end_page = len(doc)
    for _, title, page in toc:
        if ("References" in title) or ("Bibliography" in title):
            print(f"{title} on page {page}")
            end_page = page
    print(f"Extraction should start on page {start_page} and end on page {end_page}")
    # Clipping rect that excludes headers and footers.
    # NOTE(review): hard-coded for a US-Letter page (612 pt wide) — confirm
    # against the protocols actually uploaded.
    rect = pymupdf.Rect(0.0, 100.0, 612.0, 650.0)
    # Capture the first three pages (0, 1, 2) unclipped when front matter was
    # detected (title page etc.), then the clipped body pages.
    extracted_text = ""
    for page in doc.pages():
        if start_page != 1 and page.number in (0, 1, 2):
            extracted_text += page.get_text()
        elif page.number in range(start_page - 1, end_page):
            extracted_text += page.get_text(clip=rect)
    msg = cl.Message(
        content=f"""Processing selected file: `{file.name}`...
Extraction beginning on page {start_page} and ending on page {end_page}.
Using a clipping rectangle to exclude headers and footers ({rect}).
Processed {end_page - start_page} pages of PDF document.
Length of extracted text string is {len(extracted_text)}
"""
    )
    await msg.send()
    chunk_size = 3000
    chunk_overlap = 200
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # length_function = tiktoken_len
    )
    text_chunks = text_splitter.split_text(extracted_text)
    document = [Document(page_content=chunk) for chunk in text_chunks]
    msg = cl.Message(
        content=f"""Splitting the text with a recursive character splitter.
Set chunk size at {chunk_size} and overlap at {chunk_overlap}.
Number of resulting chunks: {len(text_chunks)}.
Document created from chunks to get stored in vector database.
Length of the document: {len(document)} (should be same as number of chunks).
"""
    )
    await msg.send()
    qdrant_vectorstore = getVectorstore(document, file.name)
    # My vectorstore may have multiple protocols or documents that have been
    # stored and persisted. But I only want the context of the current session
    # to relate to the document that I just processed, so I pass in the title
    # of the document as a filter for the retrieved chunks.
    protocol_retriever = qdrant_vectorstore.as_retriever(
        search_kwargs={
            'filter': rest.Filter(
                must=[
                    rest.FieldCondition(
                        key="metadata.document_title",
                        match=rest.MatchAny(any=[file.name])
                    )
                ]
            ),
            'k': 15,
        }
    )
    # Create prompt and chain: retrieve context for the question, then answer.
    rag_prompt = ChatPromptTemplate.from_template(prompts.rag_prompt_template)
    llm = default_llm
    rag_chain = (
        {"context": itemgetter("question") | protocol_retriever, "question": itemgetter("question")}
        | rag_prompt | llm | StrOutputParser()
    )
    # Store the chain in the per-user session so the action callbacks below
    # can reach it — it is a local here, not a module-level name, and the
    # callbacks previously crashed with NameError when referencing it.
    cl.user_session.set("rag_chain", rag_chain)
    # Heading for top of ICF document. (`date` is already imported at module
    # level; the duplicate local import was removed.)
    protocol_title = rag_chain.invoke({"question": "What is the exact title of this protocol? Only return the title itself without any other description."})
    principal_investigator = rag_chain.invoke({"question":"What is the name of the principal investigator of the study? Only return the name itself without any other description."})
    support = rag_chain.invoke({"question":"What agency is funding the study? Only return the name of the agency without any other description."})
    version_date = date.today().strftime("%B %d, %Y")
    msg = cl.Message(
        content=f"""
**Study Title:** {protocol_title}
**Principal Investigator:** {principal_investigator}
**Version Date:** {version_date}
**Source of Support:** {support}
---
"""
    )
    await msg.send()
    # Sending action buttons within a chatbot message; each maps to a
    # @cl.action_callback handler below.
    actions = [
        cl.Action(
            name="summary_button",
            icon="mouse-pointer-click",
            payload={"value": "summary"},
            label="Write summary"
        ),
        cl.Action(
            name="risk_button",
            icon="mouse-pointer-click",
            payload={"value": "risks"},
            label="Write risk section"
        ),
        cl.Action(
            name="benefits_button",
            icon="mouse-pointer-click",
            payload={"value": "benefits"},
            label="Write benefits section"
        ),
        cl.Action(
            name="file_button",
            icon="mouse-pointer-click",
            payload={"value": "markdown"},
            label="Create final file"
        )
    ]
    await cl.Message(content="Select consent form sections:", actions=actions).send()
@cl.action_callback("summary_button")
async def on_summary_action(action: cl.Action):
    """Generate and send the consent-form summary section."""
    # The RAG chain is built per-session in on_chat_start and shared via the
    # user session; referencing the bare name `rag_chain` here raised
    # NameError (it was a local of on_chat_start, not a module global).
    rag_chain = cl.user_session.get("rag_chain")
    if rag_chain is None:
        await cl.Message(content="Please upload a protocol PDF first.").send()
        return
    summary = rag_chain.invoke({"question": summary_query()})
    await cl.Message(content=summary).send()
    # Single quotes inside the f-string: nesting the same double quotes is a
    # SyntaxError on Python < 3.12.
    await cl.Message(content=f"Executed {action.payload['value']}").send()
    # await action.remove()
@cl.action_callback("risk_button")
async def on_risk_action(action: cl.Action):
    """Generate and send the consent-form risks section."""
    # The RAG chain is built per-session in on_chat_start and shared via the
    # user session; referencing the bare name `rag_chain` here raised
    # NameError (it was a local of on_chat_start, not a module global).
    rag_chain = cl.user_session.get("rag_chain")
    if rag_chain is None:
        await cl.Message(content="Please upload a protocol PDF first.").send()
        return
    risks = rag_chain.invoke({"question": risks_query()})
    await cl.Message(content=risks).send()
    # Single quotes inside the f-string: nesting the same double quotes is a
    # SyntaxError on Python < 3.12.
    await cl.Message(content=f"Executed {action.payload['value']}").send()
    # await action.remove()
@cl.action_callback("benefits_button")
async def on_benefits_action(action: cl.Action):
    """Generate and send the consent-form benefits section."""
    # The RAG chain is built per-session in on_chat_start and shared via the
    # user session; referencing the bare name `rag_chain` here raised
    # NameError (it was a local of on_chat_start, not a module global).
    rag_chain = cl.user_session.get("rag_chain")
    if rag_chain is None:
        await cl.Message(content="Please upload a protocol PDF first.").send()
        return
    benefits = rag_chain.invoke({"question": benefits_query()})
    await cl.Message(content=benefits).send()
    # Single quotes inside the f-string: nesting the same double quotes is a
    # SyntaxError on Python < 3.12.
    await cl.Message(content=f"Executed {action.payload['value']}").send()
    # await action.remove()
# @cl.action_callback("file_button")
# async def on_action(action: cl.Action):
# await cl.Message(content=f"Executed {action.payload["value"]}").send()
# await action.remove()
# # Now let's test the application to make a consent document
# start_time = time.time()
# # Brute force method that just saves each generated section as string
# summary = rag_chain.invoke({"question":summary_query()})
# background = rag_chain.invoke({"question":background_query()})
# number_of_participants = rag_chain.invoke({"question":number_of_participants_query()})
# study_procedures = rag_chain.invoke({"question":study_procedures_query()})
# alt_procedures = rag_chain.invoke({"question":alt_procedures_query()})
# risks = rag_chain.invoke({"question":risks_query()})
# benefits = rag_chain.invoke({"question":benefits_query()})
# end_time = time.time()
# execution_time = end_time - start_time
# msg = cl.Message(
# content=f"""
# Brute force (sequential) execution time: {execution_time:.2f} seconds.
# {summary}
# {background}
# {number_of_participants}
# {study_procedures}
# {alt_procedures}
# {risks}
# {benefits}
# """
# )
# await msg.send() |