Spaces:
Runtime error
Runtime error
Tried vectorstore cache but failed.
Browse files- app.py +38 -11
- getVectorstore.py +16 -9
- pyproject.toml +1 -1
- uv.lock +22 -6
app.py
CHANGED
@@ -34,6 +34,7 @@ async def on_chat_start():
|
|
34 |
).send()
|
35 |
|
36 |
file = files[0]
|
|
|
37 |
|
38 |
doc = pymupdf.Document(file.path)
|
39 |
toc = doc.get_toc()
|
@@ -61,12 +62,18 @@ async def on_chat_start():
|
|
61 |
# need a rect that will exclude headers and footers
|
62 |
rect = pymupdf.Rect(0.0, 100.0, 612.0, 650.0)
|
63 |
|
64 |
-
#
|
65 |
extracted_text = ""
|
|
|
|
|
66 |
for page in doc.pages():
|
67 |
-
if page.number in
|
|
|
|
|
68 |
# print(page.get_text(clip=rect))
|
69 |
extracted_text += page.get_text(clip=rect)
|
|
|
|
|
70 |
msg = cl.Message(
|
71 |
content=f"""Processing selected file: `{file.name}`...
|
72 |
Extraction beginning on page {start_page} and ending on page {end_page}.
|
@@ -102,29 +109,29 @@ async def on_chat_start():
|
|
102 |
|
103 |
await msg.send()
|
104 |
|
|
|
105 |
|
106 |
-
|
|
|
107 |
|
108 |
-
document_titles = ["protocol.pdf", "consent.pdf"]
|
109 |
|
110 |
-
# protocol_retriever = qdrant_vectorstore.as_retriever()
|
111 |
-
|
112 |
-
# protocol_retriever = create_protocol_retriever(document_titles)
|
113 |
protocol_retriever = qdrant_vectorstore.as_retriever(
|
114 |
search_kwargs={
|
115 |
'filter': rest.Filter(
|
116 |
-
|
117 |
rest.FieldCondition(
|
118 |
key="metadata.document_title",
|
119 |
-
|
120 |
)
|
121 |
]
|
122 |
),
|
123 |
-
'k':15,
|
124 |
}
|
125 |
)
|
|
|
126 |
|
127 |
-
|
|
|
128 |
# Create prompt
|
129 |
rag_prompt = ChatPromptTemplate.from_template(prompts.rag_prompt_template)
|
130 |
|
@@ -134,3 +141,23 @@ async def on_chat_start():
|
|
134 |
{"context": itemgetter("question") | protocol_retriever, "question": itemgetter("question")}
|
135 |
| rag_prompt | llm | StrOutputParser()
|
136 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
).send()
|
35 |
|
36 |
file = files[0]
|
37 |
+
print(f"filename is {file.name}")
|
38 |
|
39 |
doc = pymupdf.Document(file.path)
|
40 |
toc = doc.get_toc()
|
|
|
62 |
# need a rect that will exclude headers and footers
|
63 |
rect = pymupdf.Rect(0.0, 100.0, 612.0, 650.0)
|
64 |
|
65 |
+
#capture the first 2 page
|
66 |
extracted_text = ""
|
67 |
+
|
68 |
+
|
69 |
for page in doc.pages():
|
70 |
+
if page.number in [0, 1, 2]:
|
71 |
+
extracted_text += page.get_text()
|
72 |
+
elif page.number in range(start_page-1, end_page):
|
73 |
# print(page.get_text(clip=rect))
|
74 |
extracted_text += page.get_text(clip=rect)
|
75 |
+
|
76 |
+
|
77 |
msg = cl.Message(
|
78 |
content=f"""Processing selected file: `{file.name}`...
|
79 |
Extraction beginning on page {start_page} and ending on page {end_page}.
|
|
|
109 |
|
110 |
await msg.send()
|
111 |
|
112 |
+
qdrant_vectorstore = getVectorstore(document, file.name)
|
113 |
|
114 |
+
protocol_retriever = qdrant_vectorstore.as_retriever(search_kwargs={"k":15})
|
115 |
+
# document_titles = [file.name]
|
116 |
|
|
|
117 |
|
|
|
|
|
|
|
118 |
protocol_retriever = qdrant_vectorstore.as_retriever(
|
119 |
search_kwargs={
|
120 |
'filter': rest.Filter(
|
121 |
+
must=[
|
122 |
rest.FieldCondition(
|
123 |
key="metadata.document_title",
|
124 |
+
match=rest.MatchAny(any=[file.name])
|
125 |
)
|
126 |
]
|
127 |
),
|
128 |
+
'k': 15,
|
129 |
}
|
130 |
)
|
131 |
+
# # protocol_retriever = qdrant_vectorstore.as_retriever()
|
132 |
|
133 |
+
# protocol_retriever = create_protocol_retriever(document_titles)
|
134 |
+
|
135 |
# Create prompt
|
136 |
rag_prompt = ChatPromptTemplate.from_template(prompts.rag_prompt_template)
|
137 |
|
|
|
141 |
{"context": itemgetter("question") | protocol_retriever, "question": itemgetter("question")}
|
142 |
| rag_prompt | llm | StrOutputParser()
|
143 |
)
|
144 |
+
|
145 |
+
from datetime import date
|
146 |
+
# Heading for top of ICF document
|
147 |
+
protocol_title = rag_chain.invoke({"question": "What is the exact title of this protocol? Only return the title itself without any other description."})
|
148 |
+
principal_investigator = rag_chain.invoke({"question":"What is the name of the principal investigator of the study? Only return the name itself without any other description."})
|
149 |
+
support = rag_chain.invoke({"question":"What agency is funding the study? Only return the name of the agency without any other description."})
|
150 |
+
version_date = date.today().strftime("%B %d, %Y")
|
151 |
+
|
152 |
+
msg = cl.Message(
|
153 |
+
content=f"""
|
154 |
+
**Study Title:** {protocol_title}
|
155 |
+
**Principal Investigator:** {principal_investigator}
|
156 |
+
**Version Date:** {version_date}
|
157 |
+
**Source of Support:** {support}
|
158 |
+
---
|
159 |
+
"""
|
160 |
+
)
|
161 |
+
|
162 |
+
await msg.send()
|
163 |
+
|
getVectorstore.py
CHANGED
@@ -19,29 +19,36 @@ This could also be useful if different versions of documents are in existence.
|
|
19 |
recreate a large vectorstore. But the user could select the most recent version.
|
20 |
"""
|
21 |
|
22 |
-
|
23 |
def get_document_hash(doc_content):
|
24 |
"""Generate a unique hash for the document content."""
|
25 |
return hashlib.md5(doc_content.encode()).hexdigest()
|
26 |
|
27 |
-
def getVectorstore(document,
|
28 |
# Add a unique hash to your documents
|
29 |
for doc in document:
|
30 |
doc.metadata['content_hash'] = get_document_hash(doc.page_content)
|
31 |
|
32 |
# Add the document title
|
33 |
for doc in document:
|
34 |
-
doc.metadata['document_title'] =
|
|
|
|
|
|
|
|
|
35 |
|
|
|
|
|
|
|
36 |
client = QdrantClient( url=qdrant_url)
|
37 |
-
|
|
|
38 |
# If the collection exists, then we need to check to see if our document is already
|
39 |
# present, in which case we would not want to store it again.
|
40 |
-
if client.collection_exists(
|
41 |
print("Collection exists")
|
42 |
qdrant_vectorstore = QdrantVectorStore.from_existing_collection(
|
43 |
embedding=embedding_model,
|
44 |
-
collection_name=
|
45 |
url=qdrant_url
|
46 |
# location = ":memory:"
|
47 |
)
|
@@ -61,7 +68,7 @@ def getVectorstore(document, file_path):
|
|
61 |
)
|
62 |
|
63 |
scroll_results = client.scroll(
|
64 |
-
collection_name=
|
65 |
scroll_filter=scroll_filter,
|
66 |
limit=len(document) # Adjust this if you have a large number of documents
|
67 |
)
|
@@ -82,8 +89,8 @@ def getVectorstore(document, file_path):
|
|
82 |
qdrant_vectorstore = QdrantVectorStore.from_documents(
|
83 |
documents=document,
|
84 |
embedding=embedding_model,
|
85 |
-
collection_name=
|
86 |
# location = ":memory:"
|
87 |
url=qdrant_url
|
88 |
)
|
89 |
-
return qdrant_vectorstore
|
|
|
19 |
recreate a large vectorstore. But the user could select the most recent version.
|
20 |
"""
|
21 |
|
|
|
22 |
def get_document_hash(doc_content):
|
23 |
"""Generate a unique hash for the document content."""
|
24 |
return hashlib.md5(doc_content.encode()).hexdigest()
|
25 |
|
26 |
+
def getVectorstore(document, file_name):
|
27 |
# Add a unique hash to your documents
|
28 |
for doc in document:
|
29 |
doc.metadata['content_hash'] = get_document_hash(doc.page_content)
|
30 |
|
31 |
# Add the document title
|
32 |
for doc in document:
|
33 |
+
doc.metadata['document_title'] = file_name
|
34 |
+
|
35 |
+
# Add page to metadata
|
36 |
+
for i, doc in enumerate(document):
|
37 |
+
doc.metadata['source'] = f"source_{i}"
|
38 |
|
39 |
+
# collection_name = f"pdf_to_parse_{uuid.uuid4()}"
|
40 |
+
collection_name = "protocol_collection"
|
41 |
+
|
42 |
client = QdrantClient( url=qdrant_url)
|
43 |
+
|
44 |
+
|
45 |
# If the collection exists, then we need to check to see if our document is already
|
46 |
# present, in which case we would not want to store it again.
|
47 |
+
if client.collection_exists(collection_name):
|
48 |
print("Collection exists")
|
49 |
qdrant_vectorstore = QdrantVectorStore.from_existing_collection(
|
50 |
embedding=embedding_model,
|
51 |
+
collection_name=collection_name,
|
52 |
url=qdrant_url
|
53 |
# location = ":memory:"
|
54 |
)
|
|
|
68 |
)
|
69 |
|
70 |
scroll_results = client.scroll(
|
71 |
+
collection_name=collection_name,
|
72 |
scroll_filter=scroll_filter,
|
73 |
limit=len(document) # Adjust this if you have a large number of documents
|
74 |
)
|
|
|
89 |
qdrant_vectorstore = QdrantVectorStore.from_documents(
|
90 |
documents=document,
|
91 |
embedding=embedding_model,
|
92 |
+
collection_name=collection_name,
|
93 |
# location = ":memory:"
|
94 |
url=qdrant_url
|
95 |
)
|
96 |
+
return qdrant_vectorstore
|
pyproject.toml
CHANGED
@@ -14,8 +14,8 @@ dependencies = [
|
|
14 |
"langchain-openai>=0.2.14",
|
15 |
"langgraph>=0.2.60",
|
16 |
"pymupdf>=1.25.1",
|
17 |
-
"chainlit>=1.1.202",
|
18 |
"websockets>=14.1",
|
|
|
19 |
]
|
20 |
|
21 |
[dependency-groups]
|
|
|
14 |
"langchain-openai>=0.2.14",
|
15 |
"langgraph>=0.2.60",
|
16 |
"pymupdf>=1.25.1",
|
|
|
17 |
"websockets>=14.1",
|
18 |
+
"chainlit>=1.1.202",
|
19 |
]
|
20 |
|
21 |
[dependency-groups]
|
uv.lock
CHANGED
@@ -258,7 +258,7 @@ dependencies = [
|
|
258 |
{ name = "pydantic", marker = "python_full_version >= '3.13'" },
|
259 |
{ name = "pyjwt", marker = "python_full_version >= '3.13'" },
|
260 |
{ name = "python-dotenv", marker = "python_full_version >= '3.13'" },
|
261 |
-
{ name = "python-multipart", marker = "python_full_version >= '3.13'" },
|
262 |
{ name = "starlette", version = "0.37.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13'" },
|
263 |
{ name = "syncer", marker = "python_full_version >= '3.13'" },
|
264 |
{ name = "tomli", marker = "python_full_version >= '3.13'" },
|
@@ -273,7 +273,7 @@ wheels = [
|
|
273 |
|
274 |
[[package]]
|
275 |
name = "chainlit"
|
276 |
-
version = "
|
277 |
source = { registry = "https://pypi.org/simple" }
|
278 |
resolution-markers = [
|
279 |
"python_full_version < '3.12.4'",
|
@@ -295,7 +295,7 @@ dependencies = [
|
|
295 |
{ name = "pydantic", marker = "python_full_version < '3.13'" },
|
296 |
{ name = "pyjwt", marker = "python_full_version < '3.13'" },
|
297 |
{ name = "python-dotenv", marker = "python_full_version < '3.13'" },
|
298 |
-
{ name = "python-multipart", marker = "python_full_version < '3.13'" },
|
299 |
{ name = "python-socketio", marker = "python_full_version < '3.13'" },
|
300 |
{ name = "starlette", version = "0.41.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.13'" },
|
301 |
{ name = "syncer", marker = "python_full_version < '3.13'" },
|
@@ -304,9 +304,9 @@ dependencies = [
|
|
304 |
{ name = "uvicorn", marker = "python_full_version < '3.13'" },
|
305 |
{ name = "watchfiles", marker = "python_full_version < '3.13'" },
|
306 |
]
|
307 |
-
sdist = { url = "https://files.pythonhosted.org/packages/
|
308 |
wheels = [
|
309 |
-
{ url = "https://files.pythonhosted.org/packages/
|
310 |
]
|
311 |
|
312 |
[[package]]
|
@@ -783,7 +783,7 @@ version = "0.1.0"
|
|
783 |
source = { virtual = "." }
|
784 |
dependencies = [
|
785 |
{ name = "chainlit", version = "1.1.202", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13'" },
|
786 |
-
{ name = "chainlit", version = "
|
787 |
{ name = "langchain" },
|
788 |
{ name = "langchain-community" },
|
789 |
{ name = "langchain-openai" },
|
@@ -1808,11 +1808,27 @@ wheels = [
|
|
1808 |
name = "python-multipart"
|
1809 |
version = "0.0.9"
|
1810 |
source = { registry = "https://pypi.org/simple" }
|
|
|
|
|
|
|
1811 |
sdist = { url = "https://files.pythonhosted.org/packages/5c/0f/9c55ac6c84c0336e22a26fa84ca6c51d58d7ac3a2d78b0dfa8748826c883/python_multipart-0.0.9.tar.gz", hash = "sha256:03f54688c663f1b7977105f021043b0793151e4cb1c1a9d4a11fc13d622c4026", size = 31516 }
|
1812 |
wheels = [
|
1813 |
{ url = "https://files.pythonhosted.org/packages/3d/47/444768600d9e0ebc82f8e347775d24aef8f6348cf00e9fa0e81910814e6d/python_multipart-0.0.9-py3-none-any.whl", hash = "sha256:97ca7b8ea7b05f977dc3849c3ba99d51689822fab725c3703af7c866a0c2b215", size = 22299 },
|
1814 |
]
|
1815 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1816 |
[[package]]
|
1817 |
name = "python-socketio"
|
1818 |
version = "5.12.1"
|
|
|
258 |
{ name = "pydantic", marker = "python_full_version >= '3.13'" },
|
259 |
{ name = "pyjwt", marker = "python_full_version >= '3.13'" },
|
260 |
{ name = "python-dotenv", marker = "python_full_version >= '3.13'" },
|
261 |
+
{ name = "python-multipart", version = "0.0.9", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13'" },
|
262 |
{ name = "starlette", version = "0.37.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13'" },
|
263 |
{ name = "syncer", marker = "python_full_version >= '3.13'" },
|
264 |
{ name = "tomli", marker = "python_full_version >= '3.13'" },
|
|
|
273 |
|
274 |
[[package]]
|
275 |
name = "chainlit"
|
276 |
+
version = "2.0.0"
|
277 |
source = { registry = "https://pypi.org/simple" }
|
278 |
resolution-markers = [
|
279 |
"python_full_version < '3.12.4'",
|
|
|
295 |
{ name = "pydantic", marker = "python_full_version < '3.13'" },
|
296 |
{ name = "pyjwt", marker = "python_full_version < '3.13'" },
|
297 |
{ name = "python-dotenv", marker = "python_full_version < '3.13'" },
|
298 |
+
{ name = "python-multipart", version = "0.0.18", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.13'" },
|
299 |
{ name = "python-socketio", marker = "python_full_version < '3.13'" },
|
300 |
{ name = "starlette", version = "0.41.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.13'" },
|
301 |
{ name = "syncer", marker = "python_full_version < '3.13'" },
|
|
|
304 |
{ name = "uvicorn", marker = "python_full_version < '3.13'" },
|
305 |
{ name = "watchfiles", marker = "python_full_version < '3.13'" },
|
306 |
]
|
307 |
+
sdist = { url = "https://files.pythonhosted.org/packages/45/24/424679b769664876093b3e42167911535d1739bc1bc88f3963c69affed9e/chainlit-2.0.0.tar.gz", hash = "sha256:47b3a274a20cefb443f356d69f1c6a48818d67eb4a11552c749bfa6f414423ed", size = 4637040 }
|
308 |
wheels = [
|
309 |
+
{ url = "https://files.pythonhosted.org/packages/87/2a/e2bbb86fc3a34c7bf798644edb95bf14fd79a8b3f6c99e4b27e5df1e24f0/chainlit-2.0.0-py3-none-any.whl", hash = "sha256:2b58ac6b513d94aef0380d1d68b73f74718c0c844586b050ce8d5e0a82eb8133", size = 4703622 },
|
310 |
]
|
311 |
|
312 |
[[package]]
|
|
|
783 |
source = { virtual = "." }
|
784 |
dependencies = [
|
785 |
{ name = "chainlit", version = "1.1.202", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13'" },
|
786 |
+
{ name = "chainlit", version = "2.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.13'" },
|
787 |
{ name = "langchain" },
|
788 |
{ name = "langchain-community" },
|
789 |
{ name = "langchain-openai" },
|
|
|
1808 |
name = "python-multipart"
|
1809 |
version = "0.0.9"
|
1810 |
source = { registry = "https://pypi.org/simple" }
|
1811 |
+
resolution-markers = [
|
1812 |
+
"python_full_version >= '3.13'",
|
1813 |
+
]
|
1814 |
sdist = { url = "https://files.pythonhosted.org/packages/5c/0f/9c55ac6c84c0336e22a26fa84ca6c51d58d7ac3a2d78b0dfa8748826c883/python_multipart-0.0.9.tar.gz", hash = "sha256:03f54688c663f1b7977105f021043b0793151e4cb1c1a9d4a11fc13d622c4026", size = 31516 }
|
1815 |
wheels = [
|
1816 |
{ url = "https://files.pythonhosted.org/packages/3d/47/444768600d9e0ebc82f8e347775d24aef8f6348cf00e9fa0e81910814e6d/python_multipart-0.0.9-py3-none-any.whl", hash = "sha256:97ca7b8ea7b05f977dc3849c3ba99d51689822fab725c3703af7c866a0c2b215", size = 22299 },
|
1817 |
]
|
1818 |
|
1819 |
+
[[package]]
|
1820 |
+
name = "python-multipart"
|
1821 |
+
version = "0.0.18"
|
1822 |
+
source = { registry = "https://pypi.org/simple" }
|
1823 |
+
resolution-markers = [
|
1824 |
+
"python_full_version < '3.12.4'",
|
1825 |
+
"python_full_version >= '3.12.4' and python_full_version < '3.13'",
|
1826 |
+
]
|
1827 |
+
sdist = { url = "https://files.pythonhosted.org/packages/b4/86/b6b38677dec2e2e7898fc5b6f7e42c2d011919a92d25339451892f27b89c/python_multipart-0.0.18.tar.gz", hash = "sha256:7a68db60c8bfb82e460637fa4750727b45af1d5e2ed215593f917f64694d34fe", size = 36622 }
|
1828 |
+
wheels = [
|
1829 |
+
{ url = "https://files.pythonhosted.org/packages/13/6b/b60f47101ba2cac66b4a83246630e68ae9bbe2e614cbae5f4465f46dee13/python_multipart-0.0.18-py3-none-any.whl", hash = "sha256:efe91480f485f6a361427a541db4796f9e1591afc0fb8e7a4ba06bfbc6708996", size = 24389 },
|
1830 |
+
]
|
1831 |
+
|
1832 |
[[package]]
|
1833 |
name = "python-socketio"
|
1834 |
version = "5.12.1"
|