Mdean77 committed on
Commit
1f49ee0
·
1 Parent(s): 636c119
Files changed (12) hide show
  1. .dockerignore +10 -0
  2. .example.env +1 -0
  3. .python-version +1 -0
  4. Dockerfile +34 -0
  5. app.py +89 -0
  6. defaults.py +9 -0
  7. getVectorstore.py +88 -0
  8. makeMarkdown.py +20 -0
  9. prompts.py +15 -0
  10. pyproject.toml +24 -0
  11. queries.py +149 -0
  12. uv.lock +0 -0
.dockerignore ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ Clinical_Trial_Project.pages
2
+ .example.env
3
+ .venv
4
+ .chainlit
5
+ __pycache__
6
+ .gitignore
7
+ .gitattributes
8
+ appOLD.py
9
+ notebook.ipynb
10
+ langgraph.json
.example.env ADDED
@@ -0,0 +1 @@
 
 
1
+ OPENAI_API_KEY=sk-proj-etc
.python-version ADDED
@@ -0,0 +1 @@
 
 
1
+ 3.12
Dockerfile ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Dockerfile for the Clinical Trial Project
# December 31, 2024
# Happy New Year!

# Get a distribution that has uv already installed.
# The Python version of the image matches the 3.12 pin in .python-version, so
# the interpreter shipped in the image satisfies the project's own pin.
FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim

# Add user - this is the user that will run the app
# If you do not set user, the app will run as root (undesirable)
RUN useradd -m -u 1000 user
USER user

# Set the home directory and path
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

# NEEDED FOR CHAINLIT IN HUGGING FACE SPACES
ENV UVICORN_WS_PROTOCOL=websockets

# Set the working directory
WORKDIR $HOME/app

# Copy the app to the container
COPY --chown=user . $HOME/app

# Install the dependencies exactly as recorded in uv.lock
RUN uv sync --frozen
# RUN uv sync

# Expose the port
EXPOSE 7860

# Run the app
CMD ["uv", "run", "chainlit", "run", "app.py", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dotenv import load_dotenv
3
+
4
+ load_dotenv()
5
+ OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
6
+
7
+ import chainlit as cl
8
+ import pymupdf
9
+ import tiktoken
10
+ from langchain_core.documents.base import Document
11
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
12
+
13
+ # def tiktoken_len(text):
14
+ # tokens = tiktoken.encoding_for_model("gpt-4o").encode(
15
+ # text,
16
+ # )
17
+ # return len(tokens)
18
+
19
@cl.on_chat_start
async def on_chat_start():
    """Ask for a protocol PDF, extract its body text, and split it into chunks.

    The PDF's table of contents selects the page window: extraction starts on
    the page after the "List of Figures" entry and ends on the page of the
    "References" or "Bibliography" entry.  If an entry is missing, the window
    falls back to the first or last page of the document respectively.  Text is
    read through a clipping rectangle so page headers and footers are left out.

    Two chat messages report progress: one after extraction and one after the
    text has been split into chunks and wrapped in ``Document`` objects.
    """
    files = await cl.AskFileMessage(
        content="Upload a file to proceed",
        accept=["application/pdf"],
        max_size_mb=50,
        timeout=180,
    ).send()

    # Nothing was uploaded (for example the prompt timed out): stop here
    # instead of failing on files[0].
    if not files:
        await cl.Message(
            content="No file was uploaded, so there is nothing to process."
        ).send()
        return

    file = files[0]

    doc = pymupdf.Document(file.path)
    toc = doc.get_toc()

    # Defaults keep the names bound when the table of contents lacks the
    # entries searched for below; they select the whole document.
    start_page = 1
    end_page = doc.page_count

    # Want to find the List Of Figures page because that is the last page I want to skip
    for _, title, page in toc:
        if title == "List of Figures":
            print(f"{title} on page {page}")
            start_page = page + 1

    # get the last page I want included
    for _, title, page in toc:
        if ("References" in title) or ("Bibliography" in title):
            print(f"{title} on page {page}")
            end_page = page

    print(f"Extraction should start on page {start_page} and end on page {end_page}")

    # need a rect that will exclude headers and footers
    rect = pymupdf.Rect(0.0, 100.0, 612.0, 650.0)

    # create the final text; page.number is 0-based while the table of
    # contents is 1-based, hence the shifted window
    page_window = range(start_page - 1, end_page)
    page_texts = [
        page.get_text(clip=rect)
        for page in doc.pages()
        if page.number in page_window
    ]
    extracted_text = "".join(page_texts)

    # Report the number of pages actually read rather than a difference of
    # page numbers, which undercounts the inclusive window by one.
    msg = cl.Message(
        content=f"""Processing selected file: `{file.name}`...
Extraction beginning on page {start_page} and ending on page {end_page}.
Using a clipping rectangle to exclude headers and footers ({rect}).
Processed {len(page_texts)} pages of PDF document.
Length of extracted text string is {len(extracted_text)}
"""
    )
    await msg.send()

    chunk_size = 2000
    chunk_overlap = 200

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # length_function = tiktoken_len
    )

    text_chunks = text_splitter.split_text(extracted_text)
    document = [Document(page_content=chunk) for chunk in text_chunks]

    msg = cl.Message(
        content=f"""Splitting the text with a recursive character splitter.
Set chunk size at {chunk_size} and overlap at {chunk_overlap}.
Number of resulting chunks: {len(text_chunks)}.
Document created from chunks to get stored in vector database.
Length of the document: {len(document)} (should be same as number of chunks).
"""
    )

    await msg.send()
defaults.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ # Defaults
2
+ from langchain_openai.embeddings import OpenAIEmbeddings
3
+ from langchain_openai.chat_models import ChatOpenAI
4
+ import os
5
# NEWKEY is tried first; OPENAI_API_KEY is the fallback because that is the
# variable name used in .example.env and read by app.py.
OPENAI_API_KEY = os.getenv("NEWKEY") or os.getenv("OPENAI_API_KEY")

# Embedding model used when building and querying the vector store.
default_embedding_model = OpenAIEmbeddings(model="text-embedding-3-small", api_key=OPENAI_API_KEY)

# Qdrant targets: an in-process in-memory instance, or a server URL.
default_location = ":memory:"
default_url = "http://localhost:6333"

# Chat model: streaming enabled, temperature 0.
default_llm = ChatOpenAI(model="gpt-4o", api_key=OPENAI_API_KEY, streaming=True, temperature=0)
getVectorstore.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from qdrant_client import QdrantClient
2
+ from qdrant_client.http import models as rest
3
+ from langchain_qdrant import QdrantVectorStore
4
+ import hashlib
5
+ import defaults
6
+
7
+ embedding_model = defaults.default_embedding_model
8
+ qdrant_url = defaults.default_url
9
+
10
+ """
11
+ This code creates a hash for every chunk and checks to see if that chunk already exists in the
12
+ vector database. We only want one collection in Qdrant, but want to make sure that if a user
13
+ selects a document that has already been embedded and stored, it does not get stored again. We
14
+ also add metadata for the document title, so that we can make our retriever focus on documents of
15
+ interest. For example, after some usage, the application might have 20 documents for the user to
16
+ select from. We want the retriever to be exactly right for the documents that they selected.
17
+
18
+ This could also be useful if different versions of documents are in existence. We would not want to
19
+ recreate a large vectorstore. But the user could select the most recent version.
20
+ """
21
+
22
+
23
def get_document_hash(doc_content):
    """Return the hexadecimal MD5 digest of ``doc_content``.

    The digest is only a content fingerprint used to recognise chunks that
    are already stored, so MD5 is flagged as not security-relevant; this also
    keeps the call usable on builds that restrict MD5 for security purposes.

    Args:
        doc_content: text of one chunk.

    Returns:
        A 32-character lowercase hexadecimal string.
    """
    return hashlib.md5(
        doc_content.encode("utf-8"), usedforsecurity=False
    ).hexdigest()
26
+
27
def getVectorstore(document, file_path):
    """Store the chunks of one document in the "protocol_collection" collection.

    Every chunk is tagged with a hash of its text (``content_hash``) and with
    the file name taken from ``file_path`` (``document_title``).  If the
    collection already exists, only chunks whose hash is not stored yet are
    added; otherwise the collection is created from all chunks.

    Args:
        document: list of LangChain ``Document`` chunks; their ``metadata``
            dictionaries are updated in place.
        file_path: path of the source file; its last component becomes the
            ``document_title`` metadata.

    Returns:
        The ``QdrantVectorStore`` for "protocol_collection".
    """
    import os  # local import: this module does not import os at file level

    # Tag every chunk once with its fingerprint and the source file name.
    document_title = os.path.basename(file_path)
    for doc in document:
        doc.metadata['content_hash'] = get_document_hash(doc.page_content)
        doc.metadata['document_title'] = document_title

    # NOTE(review): an in-memory client created here starts out empty, so the
    # "collection exists" branch is presumably only reached once the URL
    # client below is restored - confirm the intended storage mode.
    # client = QdrantClient(url=qdrant_url)
    client = QdrantClient(defaults.default_location)

    # If the collection exists, then we need to check to see if our document is already
    # present, in which case we would not want to store it again.
    if client.collection_exists("protocol_collection"):
        print("Collection exists")
        # Wrap the client that was just checked: opening another ":memory:"
        # location would point at a different, empty database.
        qdrant_vectorstore = QdrantVectorStore(
            client=client,
            collection_name="protocol_collection",
            embedding=embedding_model,
        )

        # Ask Qdrant which of this document's hashes are already stored.
        scroll_filter = rest.Filter(
            should=[
                rest.FieldCondition(
                    key="metadata.content_hash",
                    match=rest.MatchValue(value=doc.metadata['content_hash']),
                )
                for doc in document
            ]
        )

        points, _ = client.scroll(
            collection_name="protocol_collection",
            scroll_filter=scroll_filter,
            limit=len(document),  # Adjust this if you have a large number of documents
        )

        existing_hashes = {
            point.payload.get('metadata', {}).get('content_hash')
            for point in points
        }

        # Only chunks that are not stored yet get embedded and added.
        new_docs = [
            doc for doc in document
            if doc.metadata['content_hash'] not in existing_hashes
        ]

        if new_docs:
            qdrant_vectorstore.add_documents(new_docs)

        print(f"Added {len(new_docs)} new documents")
        # Count the chunks that were skipped, not the distinct hashes found.
        print(f"Skipped {len(document) - len(new_docs)} existing documents")
    else:
        print("Collection does not exist")  # So we go ahead and just add the documents
        qdrant_vectorstore = QdrantVectorStore.from_documents(
            documents=document,
            embedding=embedding_model,
            collection_name="protocol_collection",
            location=defaults.default_location,
        )
    return qdrant_vectorstore
makeMarkdown.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # from IPython.display import display, Markdown
2
+ import os
3
+
4
+ """
5
+ This procedure writes the full Markdown document to disk.
6
+ """
7
+
8
def makeMarkdown(markdownFile, title):
    """Join Markdown sections and save them as ``<title>.md``.

    Args:
        markdownFile: iterable of Markdown strings, written in the given
            order with a horizontal rule between consecutive sections.
        title: file name without the ``.md`` extension; it is used for the
            file name only and is not inserted into the document.

    The absolute path of the written file is printed.
    """
    section_break = "\n\n---\n\n"
    body = section_break.join(markdownFile)

    output_path = f'{title}.md'
    with open(output_path, 'w', encoding='utf-8') as out:
        out.write(body)

    print(f"Markdown document has been created: {os.path.abspath(output_path)}")
prompts.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # File to contain all my prompts and welcome screens, etc.
2
+
3
# Prompt template with two placeholders, {context} and {question}.
# The trailing backslash after the opening quotes keeps the template from
# starting with an empty line. How the placeholders are filled is decided by
# the code that uses this template (not shown in this file).
rag_prompt_template = """\
You are a helpful and polite and cheerful assistant who answers questions based solely on the provided context.
Use the context to answer the question and provide a clear answer. Do not mention the document in your
response.
If there is no specific information
relevant to the question, then tell the user that you can't answer based on the context.

Context:
{context}

Question:
{question}
"""
pyproject.toml ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "informed-consent"
3
+ version = "0.1.0"
4
+ description = "Add your description here"
5
+ readme = "README.md"
6
+ requires-python = ">=3.12"
7
+ dependencies = [
8
+ "langchain-qdrant>=0.2.0",
9
+ "langchain>=0.3.13",
10
+ "langchain-community>=0.3.13",
11
+ "pydantic==2.10.1",
12
+ "qdrant-client>=1.12.2",
13
+ "tiktoken>=0.8.0",
14
+ "langchain-openai>=0.2.14",
15
+ "langgraph>=0.2.60",
16
+ "pymupdf>=1.25.1",
17
+ "chainlit>=1.1.202",
18
+ "rich>=13.9.4",
19
+ ]
20
+
21
+ [dependency-groups]
22
+ dev = [
23
+ "ipykernel>=6.29.5",
24
+ ]
queries.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def summary_query():
    """Return the prompt that requests the "Study Summary" consent section.

    The prompt asks for a 500-1000 word Markdown introduction addressed to
    the parent of a potentially eligible child.
    """
    # Local name differs from the function name so the function is not shadowed.
    prompt = """
Please write a summary of the protocol that can be used as the introduction to an informed
consent document for a patient to participate in the study described in the protocol. This
summary should be between 500 and 1000 words in length, and should be understandable by a normally
intelligent adult patient. You must explain any medical or technical terms in a way
that a high school graduate can understand. The summary should briefly explain the study, the rationale for the study,
the risks and benefits of participation, and that participation is entirely voluntary.

Start the summary with a level 2 Markdown header (##) titled "Study Summary". Then continue with
the content without any further subheadings. Produce the entire summary in Markdown format so it
can be nicely printed for the reader.

You should assume that the consent form is addressed to the parent of a potentially eligible
child, and the very beginning of the summary should indicate that they are being invited to allow
their child to participate because the child is potentially eligible. State why their child is potentially
eligible.

All the details of this introductory summary should be specific for this protocol.

"""
    return prompt
23
+
24
def background_query():
    """Return the prompt that requests the "Background" consent section.

    The prompt asks for a 500-1000 word Markdown summary of the disease, the
    study interventions and their scientific rationale, leaving procedures,
    risks, benefits and voluntariness to other sections.
    """
    prompt = """
Please write a summary of the protocol that can be used as the background section of an informed
consent document for a patient to participate in the study described in the protocol. This
summary should be between 500 and 1000 words in length, and should be understandable by a normally
intelligent adult patient. You must explain any medical or technical terms in a way
that a high school graduate can understand. The summary should briefly explain why this patient is being
approached to be in the study, including a brief description of the disease that is being studied in the
protocol, a description of the study interventions, and the scientific reasons that the investigators believe
the intervention might help the patient.

Do not include the specific study procedures in this summary, because this will be presented in a different section of
the informed consent document. You also do not need to mention that participation is voluntary, nor
the specific risks and benefits of the study, because this information is being presented in a different
part of the informed consent document.

Start the summary with a level 2 Markdown header (##) titled "Background". Then continue with
the content without any further subheadings. Produce the entire summary in Markdown format so it
can be nicely printed for the reader.

All the details of this background summary should be specific for this protocol.

"""
    return prompt
49
+
50
def number_of_participants_query():
    """Return the prompt that requests the "Number of Participants" section.

    The prompt asks for a Markdown summary of at most about 200 words covering
    study sites, funding source, planned enrollment and enrollment period.
    """
    prompt = """
Please write a summary of the protocol that can be used
for the "number of participants" section of the informed consent document. This should include where the
study is being conducted (for example, at this hospital, or in a network, or in multiple hospitals), the funding source
(often the NIH), the total number of participants that are planned to be enrolled in the study,
and the total period of the time that the study is expected to enroll subjects. This summary should not require more than 200 words.

Start the summary with a level 2 Markdown header (##) titled "Number of Participants". Then continue with
the content without any further subheadings. Produce the entire summary in Markdown format so it
can be nicely printed for the reader.

All the details of this number of participants summary should be specific for this protocol.

"""
    return prompt
67
+
68
def study_procedures_query():
    """Return the prompt that requests the "Study Procedures" consent section.

    The prompt asks for a detailed, second-person Markdown description of all
    significant study procedures, organised with at most two levels of
    subheadings.
    """
    prompt = """
Please write a detailed summary of all the study procedures that will be carried out in this protocol. This will
be used for the "study procedures" section of the informed consent document that the patient will read when deciding
whether to participate in the study, so it is important that all significant procedures be included.
Make sure that everything will be understandable to the reader, who is not trained in medicine. Do not write
the summary as if it is in third person - write it like you are speaking directly to the patient (i.e. use "you" instead
of the "patient", with correct grammar of course.) Do not include a welcome to the study, or discussion about
participation being voluntary, as that information is in a different part of the consent document. Do not include
risks and benefits as these are presented in a different part of the consent document. Please be detailed, as it is
important that the patient understand each procedure.
The length of this summary is usually
2000 to 3000 words.

Start the summary with a level 2 Markdown header (##) titled "Study Procedures", and then continue the section with subheadings
that will help organize the information for the reader. Do not go more than two subheadings deep.

All the details of study procedures should be specific to this protocol.

"""
    return prompt
89
+
90
def alt_procedures_query():
    """Return the prompt that requests the "Alternative Procedures" section.

    The prompt contains a quoted example of such a section and asks for a
    protocol-specific Markdown summary, usually under 500 words.
    """
    # Local name differs from the function name so the function is not shadowed.
    prompt = """
Please write a summary of alternatives to participation in this study. An example is:

" Your participation in this study is voluntary. It is not necessary to be in this study to get care for
your illness. Monitoring of immune function is not currently done as part of routine ICU care. There are no
other treatments designed to increase immune function or reduce inflammation that are routinely used in
children with sepsis."

Note that this example is purely an example, and your summary must be specific to the protocol. The summary should
be easily understandable by medically untrained readers. This section is usually less than 500 words in length.


Start the summary with a level 2 Markdown header (##) titled "Alternative Procedures", and then continue with
the content without any further subheadings. Produce the entire summary in Markdown format so it
can be nicely printed for the reader.


"""
    return prompt
110
+
111
def risks_query():
    """Return the prompt that requests the "Risks" consent section.

    The prompt asks for a detailed, second-person Markdown description of all
    significant risks, including those of interventions, testing and data
    collection, with at most two levels of subheadings.
    """
    # Local name differs from the function name so the function is not shadowed.
    # The quoted phrase is spelled correctly because the model is asked to
    # mention it.
    prompt = """
Please write a detailed summary of the risks of participating in the study. This will be used for the
"Risks" section of the informed consent document. It is important that all significant risks of study
participation are described in detail. The summary must be easily readable by untrained readers, so provide
definitions of technical or medical terms. Address all the risks by speaking to the patient, not by passively
mentioning risks to "the patient". Especially include risks that are associated with the study interventions such
as drugs or devices, or associated with testing that is done as part of the study. Also include
the risks associated with data collection, and also mention "unforeseeable risks".

The length of this risk summary is usually
2000 to 3000 words.

Start the summary with a level 2 Markdown header (##) titled "Risks", and then continue the section with subheadings
that will help organize the information for the reader. Do not go more than two subheadings deep.

All the details of study risks should be specific to this protocol.

"""
    return prompt
131
+
132
def benefits_query():
    """Return the prompt that requests the "Benefits" consent section.

    The prompt asks for a 500-750 word Markdown summary with one subheading
    for benefits to the participant and one for benefits to others, without
    overstating benefits to the participant.
    """
    # Local name differs from the function name so the function is not shadowed.
    prompt = """
Please write a summary of the potential benefits of participating in the study. This will be used for the
"Benefits" section of the informed consent document. The summary should include potential benefits for the patient
(addressed as "you"), and potential benefits for others. Since this is a research study and it is
not known if the intervention is helpful, it is important to not overstate potential benefits for the patient.

The length of this benefits summary is usually
500 to 750 words.

Start the summary with a level 2 Markdown header (##) titled "Benefits", and then continue
with a subheading for "Potential
Benefits for You" and another subheading for "Potential Benefits for Others".

All the information of study benefits should be specific to this protocol.

"""
    return prompt
uv.lock ADDED
The diff for this file is too large to render. See raw diff