File size: 8,736 Bytes
1f49ee0
 
8f97916
1f49ee0
 
 
 
 
 
 
 
2edf2fb
 
 
 
 
 
 
 
 
a841fcc
8f97916
 
 
 
 
 
 
2edf2fb
 
1f49ee0
 
 
 
 
 
 
 
 
 
 
5e4b78a
1f49ee0
 
 
 
004e22c
 
1f49ee0
 
 
 
 
004e22c
1f49ee0
004e22c
 
1f49ee0
 
 
 
 
004e22c
1f49ee0
 
 
 
 
 
5e4b78a
1f49ee0
5e4b78a
 
1f49ee0
1be02f9
5e4b78a
 
1f49ee0
 
5e4b78a
 
1f49ee0
 
 
 
 
 
 
 
 
 
2a03ddd
1f49ee0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2edf2fb
 
5e4b78a
2edf2fb
a841fcc
 
 
 
2edf2fb
 
 
5e4b78a
2edf2fb
 
5e4b78a
2edf2fb
 
 
5e4b78a
2edf2fb
 
5e4b78a
2edf2fb
 
 
 
 
 
 
 
 
5e4b78a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a841fcc
2a03ddd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a841fcc
2a03ddd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
# --- Environment / configuration ---------------------------------------------
import os
from dotenv import load_dotenv
import time  # NOTE(review): only referenced in the commented-out benchmark at file end — confirm before removing
load_dotenv()  # load .env so the key below (and any downstream OpenAI client) can see it
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')  # may be None when .env is absent — downstream clients will fail loudly

# --- Third-party and project imports ------------------------------------------
import chainlit as cl
import pymupdf
import tiktoken  # NOTE(review): only used by the commented-out tiktoken_len length_function — confirm before removing
from langchain_core.documents.base import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
import getVectorstore  # NOTE(review): immediately shadowed by the `from` import on the next line
from getVectorstore import getVectorstore
from qdrant_client.http import models as rest
from langchain.prompts import ChatPromptTemplate
import prompts  # used later as prompts.rag_prompt_template
from prompts import rag_prompt_template  # NOTE(review): appears unused — the module attribute is accessed instead
from defaults import default_llm
from operator import itemgetter
from langchain.schema.output_parser import StrOutputParser
from datetime import date
# Per-section query builders for the consent-form generator.
from queries import summary_query
from queries import background_query
from queries import number_of_participants_query
from queries import study_procedures_query
from queries import alt_procedures_query
from queries import risks_query
from queries import benefits_query



def _find_page_range(toc, page_count):
    """Return the inclusive 1-based (start_page, end_page) span to extract.

    Front matter is skipped by starting one page after the LAST
    "List of Figures" TOC entry (default: page 1 when no such entry
    exists).  Back matter is dropped by ending at the LAST TOC entry
    whose title contains "References" or "Bibliography" (default: the
    document's final page).

    :param toc: PyMuPDF table of contents — a list of (level, title, page) triples.
    :param page_count: total number of pages in the document.
    """
    start_page = 1
    for _, title, page in toc:
        if title == "List of Figures":
            print(f"{title} on page {page}")
            start_page = page + 1

    end_page = page_count
    for _, title, page in toc:
        if ("References" in title) or ("Bibliography" in title):
            print(f"{title} on page {page}")
            end_page = page

    return start_page, end_page


def _extract_text(doc, start_page, end_page, rect):
    """Concatenate text from the selected page span of a PyMuPDF document.

    When front matter is being skipped (start_page != 1), the first three
    pages (title page etc.) are still captured in full, without clipping.
    All pages inside [start_page, end_page] are extracted with *rect* as a
    clip box so headers and footers fall outside the captured area.
    """
    extracted = ""
    for page in doc.pages():
        if start_page != 1 and page.number in (0, 1, 2):
            # page.number is 0-based; keep the opening pages un-clipped.
            extracted += page.get_text()
        elif page.number in range(start_page - 1, end_page):
            extracted += page.get_text(clip=rect)
    return extracted


@cl.on_chat_start
async def on_chat_start():
    """Chainlit session entry point.

    Asks the user for a protocol PDF, extracts and chunks its body text,
    indexes it in Qdrant (filtered by document title so each session only
    retrieves from its own upload), builds a RAG chain, emits a consent-form
    heading, and registers action buttons that generate individual sections.
    """
    files = await cl.AskFileMessage(
        content="Upload a file to proceed",
        accept=["application/pdf"],
        max_size_mb=50,
        timeout=180,
    ).send()
    if not files:
        # AskFileMessage resolves to None when the prompt times out;
        # indexing files[0] would raise TypeError.
        await cl.Message(content="No file was uploaded before the timeout. Please refresh and try again.").send()
        return

    file = files[0]
    print(f"filename is {file.name}")

    doc = pymupdf.Document(file.path)
    start_page, end_page = _find_page_range(doc.get_toc(), len(doc))
    print(f"Extraction should start on page {start_page} and end on page {end_page}")

    # Clip box in PDF points (612 pt = US-Letter width) that excludes the
    # header band (top 100 pt) and the footer band (below 650 pt).
    rect = pymupdf.Rect(0.0, 100.0, 612.0, 650.0)
    extracted_text = _extract_text(doc, start_page, end_page, rect)

    await cl.Message(
        content=f"""Processing selected file: `{file.name}`...
        Extraction beginning on page {start_page} and ending on page {end_page}.
        Using a clipping rectangle to exclude headers and footers ({rect}).
        Processed {end_page - start_page + 1} pages of PDF document.
        Length of extracted text string is {len(extracted_text)}
        """
    ).send()

    chunk_size = 3000
    chunk_overlap = 200

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # length_function = tiktoken_len
    )

    text_chunks = text_splitter.split_text(extracted_text)
    document = [Document(page_content=chunk) for chunk in text_chunks]

    await cl.Message(
        content=f"""Splitting the text with a recursive character splitter.
        Set chunk size at {chunk_size} and overlap at {chunk_overlap}.
        Number of resulting chunks: {len(text_chunks)}.
        Document created from chunks to get stored in vector database.
        Length of the document: {len(document)} (should be same as number of chunks).
        """
    ).send()

    qdrant_vectorstore = getVectorstore(document, file.name)

    # The vector store may hold several persisted protocols; restrict
    # retrieval to the document processed in THIS session by filtering on
    # the document title stored in chunk metadata.
    protocol_retriever = qdrant_vectorstore.as_retriever(
        search_kwargs={
            'filter': rest.Filter(
                must=[
                    rest.FieldCondition(
                        key="metadata.document_title",
                        match=rest.MatchAny(any=[file.name])
                    )
                ]
            ),
            'k': 15,
        }
    )

    rag_prompt = ChatPromptTemplate.from_template(prompts.rag_prompt_template)
    llm = default_llm

    rag_chain = (
        {"context": itemgetter("question") | protocol_retriever, "question": itemgetter("question")}
        | rag_prompt | llm | StrOutputParser()
    )

    # Heading for the top of the ICF document.  `date` is already imported
    # at module level; the version date is simply "today".
    protocol_title = rag_chain.invoke({"question": "What is the exact title of this protocol?  Only return the title itself without any other description."})
    principal_investigator = rag_chain.invoke({"question": "What is the name of the principal investigator of the study?  Only return the name itself without any other description."})
    support = rag_chain.invoke({"question": "What agency is funding the study?  Only return the name of the agency without any other description."})
    version_date = date.today().strftime("%B %d, %Y")

    await cl.Message(
        content=f""" 
        **Study Title:** {protocol_title}
        **Principal Investigator:** {principal_investigator}
        **Version Date:** {version_date}
        **Source of Support:** {support}
        ---
        """
    ).send()

    # Action buttons sent inside a chatbot message; each is wired to a
    # callback below via its `name`.
    actions = [
        cl.Action(
            name="summary_button",
            icon="mouse-pointer-click",
            payload={"value": "summary"},
            label="Write summary"
        ),
        cl.Action(
            name="risk_button",
            icon="mouse-pointer-click",
            payload={"value": "risks"},
            label="Write risk section"
        ),
        cl.Action(
            name="benefits_button",
            icon="mouse-pointer-click",
            payload={"value": "benefits"},
            label="Write benefits section"
        ),
        cl.Action(
            name="file_button",
            icon="mouse-pointer-click",
            payload={"value": "markdown"},
            label="Create final file"
        )
    ]
    await cl.Message(content="Select consent form sections:", actions=actions).send()

    # Callbacks get distinct names so later definitions don't shadow earlier
    # ones; chainlit dispatches on the decorator's string argument, not the
    # function name.  Quotes inside the f-string expressions are single so
    # the code also runs on Python < 3.12 (PEP 701 relaxed this in 3.12).

    @cl.action_callback("summary_button")
    async def on_summary(action: cl.Action):
        summary = rag_chain.invoke({"question": summary_query()})
        await cl.Message(content=summary).send()
        await cl.Message(content=f"Executed {action.payload['value']}").send()

    @cl.action_callback("risk_button")
    async def on_risks(action: cl.Action):
        risks = rag_chain.invoke({"question": risks_query()})
        await cl.Message(content=risks).send()
        await cl.Message(content=f"Executed {action.payload['value']}").send()

    @cl.action_callback("benefits_button")
    async def on_benefits(action: cl.Action):
        benefits = rag_chain.invoke({"question": benefits_query()})
        await cl.Message(content=benefits).send()
        await cl.Message(content=f"Executed {action.payload['value']}").send()

    # TODO: "file_button" ("Create final file") has no registered callback
    # yet, so clicking it currently does nothing — implement or remove it.