Spaces:

Mdean77
/

Informed_Consent

Runtime error

App Files Files Community

Mdean77 committed on Jan 4

Commit

1f49ee0

1 Parent(s): 636c119

my files

Browse files

Files changed (12) hide show

.dockerignore +10 -0
.example.env +1 -0
.python-version +1 -0
Dockerfile +34 -0
app.py +89 -0
defaults.py +9 -0
getVectorstore.py +88 -0
makeMarkdown.py +20 -0
prompts.py +15 -0
pyproject.toml +24 -0
queries.py +149 -0
uv.lock +0 -0

.dockerignore ADDED Viewed

	@@ -0,0 +1,10 @@

+Clinical_Trial_Project.pages
+.example.env
+.venv
+.chainlit
+__pycache__
+.gitignore
+.gitattributes
+appOLD.py
+notebook.ipynb
+langgraph.json

.example.env ADDED Viewed

	@@ -0,0 +1 @@


1	+ OPENAI_API_KEY=sk-proj-etc

.python-version ADDED Viewed

	@@ -0,0 +1 @@


1	+ 3.12

Dockerfile ADDED Viewed

	@@ -0,0 +1,34 @@

# Dockerfile for the Clinical Trial Project
# December 31, 2024
# Happy New Year!
# Get a distribution that has uv already installed.
# The interpreter version matches the 3.12 pin in .python-version, which is
# copied into the image below.
# NOTE(review): with a 3.13 base image uv presumably has to fetch a separate
# 3.12 interpreter during `uv sync` to honor that pin - confirm.
FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim
# Add user - this is the user that will run the app
# If you do not set user, the app will run as root (undesirable)
RUN useradd -m -u 1000 user
USER user
# Set the home directory and path
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH
# NEEDED FOR CHAINLIT IN HUGGING FACE SPACES
ENV UVICORN_WS_PROTOCOL=websockets
# Set the working directory
WORKDIR $HOME/app
# Copy the app to the container
COPY --chown=user . $HOME/app
# Install the dependencies exactly as recorded in uv.lock (--frozen)
RUN uv sync --frozen
# Expose the port
EXPOSE 7860
# Run the app
CMD ["uv", "run", "chainlit", "run", "app.py", "--host", "0.0.0.0", "--port", "7860"]

app.py ADDED Viewed

	@@ -0,0 +1,89 @@

+import os
+from dotenv import load_dotenv
+load_dotenv()
+OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
+import chainlit as cl
+import pymupdf
+import tiktoken
+from langchain_core.documents.base import Document
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+# def tiktoken_len(text):
+#     tokens = tiktoken.encoding_for_model("gpt-4o").encode(
+#         text,
+#     )
+#     return len(tokens)
@cl.on_chat_start
async def on_chat_start():
    """Ask for a PDF, extract its body text, and split that text into chunks.

    The page range is taken from the PDF's table of contents: extraction
    starts on the page after the "List of Figures" entry and stops at the
    page of the last entry whose title contains "References" or
    "Bibliography". When an entry is missing, the range falls back to the
    first / last page of the document instead of failing on an unassigned
    variable. Two progress messages are sent to the chat.
    """
    # Re-prompt until a file is received; this guards against send() giving
    # back None instead of a list.
    # NOTE(review): presumably that happens when the prompt times out -
    # confirm against the Chainlit AskFileMessage documentation.
    files = None
    while files is None:
        files = await cl.AskFileMessage(
            content="Upload a file to proceed",
            accept=["application/pdf"],
            max_size_mb=50,
            timeout=180,
        ).send()

    file = files[0]
    # need a rect that will exclude headers and footers
    rect = pymupdf.Rect(0.0, 100.0, 612.0, 650.0)

    doc = pymupdf.Document(file.path)
    try:
        toc = doc.get_toc()

        # Defaults used when the table of contents lacks the expected entries
        start_page = 1
        end_page = doc.page_count

        # "List of Figures" is the last page to skip; the last match wins
        for _, title, page in toc:
            if title == "List of Figures":
                print(f"{title} on page {page}")
                start_page = page + 1

        # The references section marks the last page to include
        for _, title, page in toc:
            if ("References" in title) or ("Bibliography" in title):
                print(f"{title} on page {page}")
                end_page = page

        print(f"Extraction should start on page {start_page} and end on page {end_page}")

        # page.number is 0-based while start_page / end_page are 1-based
        page_texts = [
            page.get_text(clip=rect)
            for page in doc.pages()
            if start_page - 1 <= page.number < end_page
        ]
    finally:
        # Release the PDF handle even if extraction fails
        doc.close()

    extracted_text = "".join(page_texts)
    # Count the pages actually read rather than deriving it from the bounds
    pages_processed = len(page_texts)

    msg = cl.Message(
        content=f"""Processing selected file: `{file.name}`...
        Extraction beginning on page {start_page} and ending on page {end_page}.
        Using a clipping rectangle to exclude headers and footers ({rect}).
        Processed {pages_processed} pages of PDF document.
        Length of extracted text string is {len(extracted_text)}
        """
    )
    await msg.send()

    chunk_size = 2000
    chunk_overlap = 200
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    text_chunks = text_splitter.split_text(extracted_text)
    # One Document per chunk, ready to be stored in a vector database
    document = [Document(page_content=chunk) for chunk in text_chunks]

    msg = cl.Message(
        content=f"""Splitting the text with a recursive character splitter.
        Set chunk size at {chunk_size} and overlap at {chunk_overlap}.
        Number of resulting chunks: {len(text_chunks)}.
        Document created from chunks to get stored in vector database.
        Length of the document: {len(document)} (should be same as number of chunks).
        """
    )
    await msg.send()

defaults.py ADDED Viewed

	@@ -0,0 +1,9 @@

+# Defaults
+from langchain_openai.embeddings import OpenAIEmbeddings
+from langchain_openai.chat_models import ChatOpenAI
+import os
# Read the key from "NEWKEY" as before, but fall back to "OPENAI_API_KEY",
# the name used by .example.env and app.py, so either variable works.
OPENAI_API_KEY = os.getenv("NEWKEY") or os.getenv("OPENAI_API_KEY")
# Embedding model and chat model shared by the rest of the application
default_embedding_model = OpenAIEmbeddings(model="text-embedding-3-small", api_key=OPENAI_API_KEY)
# Qdrant connection defaults: in-memory location and local server URL
default_location = ":memory:"
default_url = "http://localhost:6333"
default_llm = ChatOpenAI(model="gpt-4o", api_key=OPENAI_API_KEY, streaming=True, temperature=0)

getVectorstore.py ADDED Viewed

	@@ -0,0 +1,88 @@

+from qdrant_client import QdrantClient
+from qdrant_client.http import models as rest
+from langchain_qdrant import QdrantVectorStore
+import hashlib
+import defaults
+embedding_model = defaults.default_embedding_model
+qdrant_url = defaults.default_url
+"""
+This code creates a hash for every chunk and checks to see if that chunk already exists in the
+vector database.  We only want one collection in Qdrant, but want to make sure that if a user
+selects a document that has already been embedded and stored, it does not get stored again.  We
+also add metadata for the document title, so that we can make our retriever focus on documents of
+interest.  For example, after some usage, the application might have 20 documents for the user to
+select from.  We want the retriever to be exactly right for the documents that they selected.
+This could also be useful if different versions of documents are in existence.  We would not want to
+recreate a large vectorstore.  But the user could select the most recent version.
+"""
def get_document_hash(doc_content):
    """Return the hexadecimal MD5 digest of the text in *doc_content*.

    The text is encoded with the default ``str.encode()`` encoding before
    hashing.
    """
    hasher = hashlib.md5()
    hasher.update(doc_content.encode())
    return hasher.hexdigest()
def getVectorstore(document, file_path):
    """Tag chunks with metadata and store them in the "protocol_collection".

    Args:
        document: Sequence of chunk objects; each must expose ``page_content``
            and a mutable ``metadata`` mapping. The metadata is updated in
            place with ``content_hash`` and ``document_title``.
        file_path: Path of the source file. The part after the last ``/`` is
            recorded as the document title.

    Returns:
        The ``QdrantVectorStore`` bound to "protocol_collection".
    """
    collection_name = "protocol_collection"
    document_title = file_path.split('/')[-1]

    # Add a content hash and the document title to every chunk
    for doc in document:
        doc.metadata['content_hash'] = get_document_hash(doc.page_content)
        doc.metadata['document_title'] = document_title

    # client = QdrantClient(url=qdrant_url)
    # NOTE(review): a new ":memory:" client is built on every call, so the
    # collection presumably never exists here and the de-duplication branch
    # below is not reached - confirm, and switch to the URL-based client
    # above if persistence between calls is wanted.
    client = QdrantClient(":memory:")

    # If the collection exists, check whether these chunks are already
    # present so they are not stored a second time.
    if client.collection_exists(collection_name):
        print("Collection exists")
        qdrant_vectorstore = QdrantVectorStore.from_existing_collection(
            embedding=embedding_model,
            collection_name=collection_name,
            # url=qdrant_url
            location=":memory:"
        )

        # Match stored points whose hash equals the hash of any incoming chunk
        scroll_filter = rest.Filter(
            should=[
                rest.FieldCondition(
                    key="metadata.content_hash",
                    match=rest.MatchValue(value=doc.metadata['content_hash'])
                ) for doc in document
            ]
        )
        scroll_results = client.scroll(
            collection_name=collection_name,
            scroll_filter=scroll_filter,
            limit=len(document)  # Adjust this if you have a large number of documents
        )
        # The first element of the scroll result holds the matching points
        existing_hashes = {
            point.payload.get('metadata', {}).get('content_hash')
            for point in scroll_results[0]
        }

        new_docs = [
            doc for doc in document
            if doc.metadata['content_hash'] not in existing_hashes
        ]
        if new_docs:
            qdrant_vectorstore.add_documents(new_docs)

        print(f"Added {len(new_docs)} new documents")
        # Report the chunks actually skipped, not the number of distinct hashes
        print(f"Skipped {len(document) - len(new_docs)} existing documents")
    else:
        # So we go ahead and just add the documents
        print("Collection does not exist")
        qdrant_vectorstore = QdrantVectorStore.from_documents(
            documents=document,
            embedding=embedding_model,
            collection_name=collection_name,
            location=":memory:"
        )
    return qdrant_vectorstore

makeMarkdown.py ADDED Viewed

	@@ -0,0 +1,20 @@

+# from IPython.display import display, Markdown
+import os
+"""
+This procedure writes the full Markdown document to disk.
+"""
def makeMarkdown(markdownFile, title):
    """Join Markdown sections and write them to ``<title>.md``.

    Args:
        markdownFile: Iterable of Markdown strings, one per section. Sections
            are separated by a horizontal rule (``---``) in the output.
        title: Used only to build the output file name; it is not written
            into the document.
    """
    separator = "\n\n---\n\n"
    output_path = f'{title}.md'
    document_text = separator.join(markdownFile)

    with open(output_path, 'w', encoding='utf-8') as handle:
        handle.write(document_text)

    absolute_path = os.path.abspath(output_path)
    print(f"Markdown document has been created: {absolute_path}")

prompts.py ADDED Viewed

	@@ -0,0 +1,15 @@

+# File to contain all my prompts and welcome screens, etc.
# Prompt template for question answering over retrieved context.
# It contains two placeholders, {context} and {question}, which must be
# filled in before the prompt is sent to a model. The leading backslash
# keeps the text from starting with a blank line.
rag_prompt_template = """\
You are a helpful and polite and cheerful assistant who answers questions based solely on the provided context.
Use the context to answer the question and provide a  clear answer. Do not mention the document in your
response.
If there is no specific information
relevant to the question, then tell the user that you can't answer based on the context.
Context:
{context}
Question:
{question}
"""

pyproject.toml ADDED Viewed

	@@ -0,0 +1,24 @@

+[project]
+name = "informed-consent"
+version = "0.1.0"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.12"
+dependencies = [
+    "langchain-qdrant>=0.2.0",
+    "langchain>=0.3.13",
+    "langchain-community>=0.3.13",
+    "pydantic==2.10.1",
+    "qdrant-client>=1.12.2",
+    "tiktoken>=0.8.0",
+    "langchain-openai>=0.2.14",
+    "langgraph>=0.2.60",
+    "pymupdf>=1.25.1",
+    "chainlit>=1.1.202",
+    "rich>=13.9.4",
+]
+[dependency-groups]
+dev = [
+    "ipykernel>=6.29.5",
+]

queries.py ADDED Viewed

	@@ -0,0 +1,149 @@

def summary_query():
    """Return the prompt asking for the "Study Summary" consent section."""
    prompt_text = """
    Please write a summary of the protocol that can be used as the introduction to an informed
    consent document for a patient to participate in the study described in the protocol.  This
    summary should be between 500 and 1000 words in length, and should be understandable by a normally
    intelligent adult patient.  You must explain any medical or technical terms in a way
    that a high school graduate can understand. The summary should briefly explain the study, the rationale for the study,
    the risks and benefits of participation, and that participation is entirely voluntary.
    Start the summary with a level 2 Markdown header (##) titled "Study Summary". Then continue with
    the content without any further subheadings. Produce the entire summary in Markdown format so it
    can be nicely printed for the reader.
    You should assume that the consent form is addressed to the parent of a potentially eligible
    child, and the very beginnning of the summary should indicate that they are being invited to allow
    their child to participate because the child is potentially eligible. State why their child is potentially
    eligible.
    All the details of this introductory summary should be specific for this protocol.
    """
    return prompt_text
def background_query():
    """Return the prompt asking for the "Background" consent section."""
    prompt_text = """
    Please write a summary of the protocol that can be used as the background section of an informed
    consent document for a patient to participate in the study described in the protocol.  This
    summary should be between 500 and 1000 words in length, and should be understandable by a normally
    intelligent adult patient.  You must explain any medical or technical terms in a way
    that a high school graduate can understand. The summary should briefly explain why this patient is being
    approached to be in the study, including a brief description of the disease that is being studied in the
    protocol, a description of the study interventions, and the scientific reasons that the investigators believe
    the intervention might help the patient.
    Do not include the specific study procedures in this summary, because this will be presented in a different section of
    the informed consent document.  You also do not need to mention that participation is voluntary, nor
    the specific risks and benefits of the study, because this information is being presented in a different
    part of the informed consent document.
    Start the summary with a level 2 Markdown header (##) titled "Background". Then continue with
    the content without any further subheadings. Produce the entire summary in Markdown format so it
    can be nicely printed for the reader.
    All the details of this background summary should be specific for this protocol.
    """
    return prompt_text
def number_of_participants_query():
    """Return the prompt asking for the "Number of Participants" section."""
    prompt_text = """
    Please write a summary of the protocol that can be used
    for the "number of participants" section of the informed consent document.  This should include where the
    study is being conducted (for example, at this hospital, or in a network, or in multiple hospitals), the funding source
    (often the NIH), the total number of participants that are planned to be enrolled in the study,
    and the total period of the time that the study is expected to enroll subjects. This summary should not require more than 200 words.
    Start the summary with a level 2 Markdown header (##) titled "Number of Participants". Then continue with
    the content without any further subheadings. Produce the entire summary in Markdown format so it
    can be nicely printed for the reader.
    All the details of this number of participants summary should be specific for this protocol.
    """
    return prompt_text
def study_procedures_query():
    """Return the prompt asking for the "Study Procedures" consent section."""
    prompt_text = """
    Please write a detailed summary of all the study procedures that will be carried out in this protocol.  This will
    be used for the "study procedures" section of the informed consent document that the patient will read when deciding
    whether to participate in the study, so it is important that all significant procedures be included.
    Make sure that everything will be understandable to the reader, who is not trained in medicine.  Do not write
    the summary as if it is in third person - write it like you are speaking directly to the patient (i.e. use "you" instead
    of the "patient", with correct grammar of course.)  Do not include a welcome to the study, or discussion about
    participation being voluntary, as that information is in a different part of the consent document.  Do not include
    risks and benefits as these are presented in a different part of the consent document.  Please be detailed, as it is
    important that the patient understand each procedure.
    The length of this summary is usually
    2000 to 3000 words.
    Start the summary with a level 2 Markdown header (##) titled "Study Procedures", and then continue the section with subheadings
    that will help organize the information for the reader.  Do not go more than two subheadings deep.
    All the details of study procedures should be specific to this protocol.
    """
    return prompt_text
def alt_procedures_query():
    """Return the prompt asking for the "Alternative Procedures" section.

    The prompt embeds a sample paragraph for the model to imitate, so the
    sample's wording matters: it reads "increase immune function or reduce
    inflammation" (the earlier text had the typo "of reduce").
    """
    alt_procedures_query = """
    Please write a  summary of alternatives to participation in this study.  An example is:
    " Your participation in this study is voluntary.  It is not necessary to be in this study to get care for
    your illness.  Monitoring of immune function is not currently done as part of routine ICU care.  There are no
    other treatments designed to increase immune function or reduce inflammation that are routinely used in
    children with sepsis."
    Note that this example is purely an example, and your summary must be specific to the protocol. The summary should
    be easily understandable by medically untrained readers.  This section is usually less than 500 words in length.
    Start the summary with a level 2 Markdown header (##) titled "Alternative Procedures", and then continue with
    the content without any further subheadings. Produce the entire summary in Markdown format so it
    can be nicely printed for the reader.
    """
    return alt_procedures_query
def risks_query():
    """Return the prompt asking for the "Risks" consent section.

    The prompt quotes a phrase the model is told to mention, so it is spelled
    correctly here ("unforeseeable risks") to keep a misspelling out of the
    generated patient-facing text.
    """
    risks_query = """
    Please write a detailed summary of the risks of participating in  the study.  This will be used for the
    "Risks" section of the informed consent document.  It is important that all significant risks of study
    participation are described in detail. The summary must be easily readable by untrained readers, so provide
    definitions of technical or medical terms.  Address all the risks by speaking to the patient, not by passively
    mentioning risks to "the patient".  Especially include risks that are associated with the study interventions such
    as drugs or devices, or associated with testing that is done as part of the study.  Also include
    the risks associated with data collection, and also mention "unforeseeable risks".
    The length of this risk summary is usually
    2000 to 3000 words.
    Start the summary with a level 2 Markdown header (##) titled "Risks", and then continue the section with subheadings
    that will help organize the information for the reader.  Do not go more than two subheadings deep.
    All the details of study risks should be specific to this protocol.
    """
    return risks_query
def benefits_query():
    """Return the prompt asking for the "Benefits" consent section.

    The length guidance refers to the benefits summary; the earlier text
    called it "this risk summary", carried over from the risks prompt.
    """
    benefits_query = """
    Please write a  summary of the potential benefits of participating in  the study.  This will be used for the
    "Benefits" section of the informed consent document.  The summary should include potential benefits for the patient
    (addressed as "you"), and potential benefits for others.  Since this is a research study and it is
    not known if the intervention is helpful, it is important to not overstate potential benefits for the patient.
    The length of this benefits summary is usually
    500 to 750 words.
    Start the summary with a level 2 Markdown header (##) titled "Benefits",  and then continue
    with a subheading for "Potential
    Benefits for You" and another subheading for "Potential Benefits for Others".
    All the information of study benefits should be specific to this protocol.
    """
    return benefits_query

uv.lock ADDED Viewed

The diff for this file is too large to render. See raw diff