Mdean77 committed on
Commit
5e4b78a
·
1 Parent(s): ca66c69

Tried vectorstore cache but failed.

Browse files
Files changed (4) hide show
  1. app.py +38 -11
  2. getVectorstore.py +16 -9
  3. pyproject.toml +1 -1
  4. uv.lock +22 -6
app.py CHANGED
@@ -34,6 +34,7 @@ async def on_chat_start():
34
  ).send()
35
 
36
  file = files[0]
 
37
 
38
  doc = pymupdf.Document(file.path)
39
  toc = doc.get_toc()
@@ -61,12 +62,18 @@ async def on_chat_start():
61
  # need a rect that will exclude headers and footers
62
  rect = pymupdf.Rect(0.0, 100.0, 612.0, 650.0)
63
 
64
- #create the final text
65
  extracted_text = ""
 
 
66
  for page in doc.pages():
67
- if page.number in range(start_page-1, end_page):
 
 
68
  # print(page.get_text(clip=rect))
69
  extracted_text += page.get_text(clip=rect)
 
 
70
  msg = cl.Message(
71
  content=f"""Processing selected file: `{file.name}`...
72
  Extraction beginning on page {start_page} and ending on page {end_page}.
@@ -102,29 +109,29 @@ async def on_chat_start():
102
 
103
  await msg.send()
104
 
 
105
 
106
- qdrant_vectorstore = getVectorstore(document, file.path)
 
107
 
108
- document_titles = ["protocol.pdf", "consent.pdf"]
109
 
110
- # protocol_retriever = qdrant_vectorstore.as_retriever()
111
-
112
- # protocol_retriever = create_protocol_retriever(document_titles)
113
  protocol_retriever = qdrant_vectorstore.as_retriever(
114
  search_kwargs={
115
  'filter': rest.Filter(
116
- must=[
117
  rest.FieldCondition(
118
  key="metadata.document_title",
119
- match=rest.MatchAny(any=document_titles)
120
  )
121
  ]
122
  ),
123
- 'k':15,
124
  }
125
  )
 
126
 
127
-
 
128
  # Create prompt
129
  rag_prompt = ChatPromptTemplate.from_template(prompts.rag_prompt_template)
130
 
@@ -134,3 +141,23 @@ async def on_chat_start():
134
  {"context": itemgetter("question") | protocol_retriever, "question": itemgetter("question")}
135
  | rag_prompt | llm | StrOutputParser()
136
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  ).send()
35
 
36
  file = files[0]
37
+ print(f"filename is {file.name}")
38
 
39
  doc = pymupdf.Document(file.path)
40
  toc = doc.get_toc()
 
62
  # need a rect that will exclude headers and footers
63
  rect = pymupdf.Rect(0.0, 100.0, 612.0, 650.0)
64
 
65
+ # capture the first 3 pages (page numbers 0, 1, 2) in full, without the header/footer clip
66
  extracted_text = ""
67
+
68
+
69
  for page in doc.pages():
70
+ if page.number in [0, 1, 2]:
71
+ extracted_text += page.get_text()
72
+ elif page.number in range(start_page-1, end_page):
73
  # print(page.get_text(clip=rect))
74
  extracted_text += page.get_text(clip=rect)
75
+
76
+
77
  msg = cl.Message(
78
  content=f"""Processing selected file: `{file.name}`...
79
  Extraction beginning on page {start_page} and ending on page {end_page}.
 
109
 
110
  await msg.send()
111
 
112
+ qdrant_vectorstore = getVectorstore(document, file.name)
113
 
114
+ protocol_retriever = qdrant_vectorstore.as_retriever(search_kwargs={"k":15})
115
+ # document_titles = [file.name]
116
 
 
117
 
 
 
 
118
  protocol_retriever = qdrant_vectorstore.as_retriever(
119
  search_kwargs={
120
  'filter': rest.Filter(
121
+ must=[
122
  rest.FieldCondition(
123
  key="metadata.document_title",
124
+ match=rest.MatchAny(any=[file.name])
125
  )
126
  ]
127
  ),
128
+ 'k': 15,
129
  }
130
  )
131
+ # # protocol_retriever = qdrant_vectorstore.as_retriever()
132
 
133
+ # protocol_retriever = create_protocol_retriever(document_titles)
134
+
135
  # Create prompt
136
  rag_prompt = ChatPromptTemplate.from_template(prompts.rag_prompt_template)
137
 
 
141
  {"context": itemgetter("question") | protocol_retriever, "question": itemgetter("question")}
142
  | rag_prompt | llm | StrOutputParser()
143
  )
144
+
145
+ from datetime import date
146
+ # Heading for top of ICF document
147
+ protocol_title = rag_chain.invoke({"question": "What is the exact title of this protocol? Only return the title itself without any other description."})
148
+ principal_investigator = rag_chain.invoke({"question":"What is the name of the principal investigator of the study? Only return the name itself without any other description."})
149
+ support = rag_chain.invoke({"question":"What agency is funding the study? Only return the name of the agency without any other description."})
150
+ version_date = date.today().strftime("%B %d, %Y")
151
+
152
+ msg = cl.Message(
153
+ content=f"""
154
+ **Study Title:** {protocol_title}
155
+ **Principal Investigator:** {principal_investigator}
156
+ **Version Date:** {version_date}
157
+ **Source of Support:** {support}
158
+ ---
159
+ """
160
+ )
161
+
162
+ await msg.send()
163
+
getVectorstore.py CHANGED
@@ -19,29 +19,36 @@ This could also be useful if different versions of documents are in existence.
19
  recreate a large vectorstore. But the user could select the most recent version.
20
  """
21
 
22
-
23
  def get_document_hash(doc_content):
24
  """Generate a unique hash for the document content."""
25
  return hashlib.md5(doc_content.encode()).hexdigest()
26
 
27
- def getVectorstore(document, file_path):
28
  # Add a unique hash to your documents
29
  for doc in document:
30
  doc.metadata['content_hash'] = get_document_hash(doc.page_content)
31
 
32
  # Add the document title
33
  for doc in document:
34
- doc.metadata['document_title'] = file_path.split('/')[-1]
 
 
 
 
35
 
 
 
 
36
  client = QdrantClient( url=qdrant_url)
37
- # client = QdrantClient(":memory:")
 
38
  # If the collection exists, then we need to check to see if our document is already
39
  # present, in which case we would not want to store it again.
40
- if client.collection_exists("protocol_collection"):
41
  print("Collection exists")
42
  qdrant_vectorstore = QdrantVectorStore.from_existing_collection(
43
  embedding=embedding_model,
44
- collection_name="protocol_collection",
45
  url=qdrant_url
46
  # location = ":memory:"
47
  )
@@ -61,7 +68,7 @@ def getVectorstore(document, file_path):
61
  )
62
 
63
  scroll_results = client.scroll(
64
- collection_name="protocol_collection",
65
  scroll_filter=scroll_filter,
66
  limit=len(document) # Adjust this if you have a large number of documents
67
  )
@@ -82,8 +89,8 @@ def getVectorstore(document, file_path):
82
  qdrant_vectorstore = QdrantVectorStore.from_documents(
83
  documents=document,
84
  embedding=embedding_model,
85
- collection_name="protocol_collection",
86
  # location = ":memory:"
87
  url=qdrant_url
88
  )
89
- return qdrant_vectorstore
 
19
  recreate a large vectorstore. But the user could select the most recent version.
20
  """
21
 
 
22
  def get_document_hash(doc_content):
23
  """Generate a unique hash for the document content."""
24
  return hashlib.md5(doc_content.encode()).hexdigest()
25
 
26
+ def getVectorstore(document, file_name):
27
  # Add a unique hash to your documents
28
  for doc in document:
29
  doc.metadata['content_hash'] = get_document_hash(doc.page_content)
30
 
31
  # Add the document title
32
  for doc in document:
33
+ doc.metadata['document_title'] = file_name
34
+
35
+ # Tag each document with a sequential source label (source_<index>) in metadata
36
+ for i, doc in enumerate(document):
37
+ doc.metadata['source'] = f"source_{i}"
38
 
39
+ # collection_name = f"pdf_to_parse_{uuid.uuid4()}"
40
+ collection_name = "protocol_collection"
41
+
42
  client = QdrantClient( url=qdrant_url)
43
+
44
+
45
  # If the collection exists, then we need to check to see if our document is already
46
  # present, in which case we would not want to store it again.
47
+ if client.collection_exists(collection_name):
48
  print("Collection exists")
49
  qdrant_vectorstore = QdrantVectorStore.from_existing_collection(
50
  embedding=embedding_model,
51
+ collection_name=collection_name,
52
  url=qdrant_url
53
  # location = ":memory:"
54
  )
 
68
  )
69
 
70
  scroll_results = client.scroll(
71
+ collection_name=collection_name,
72
  scroll_filter=scroll_filter,
73
  limit=len(document) # Adjust this if you have a large number of documents
74
  )
 
89
  qdrant_vectorstore = QdrantVectorStore.from_documents(
90
  documents=document,
91
  embedding=embedding_model,
92
+ collection_name=collection_name,
93
  # location = ":memory:"
94
  url=qdrant_url
95
  )
96
+ return qdrant_vectorstore
pyproject.toml CHANGED
@@ -14,8 +14,8 @@ dependencies = [
14
  "langchain-openai>=0.2.14",
15
  "langgraph>=0.2.60",
16
  "pymupdf>=1.25.1",
17
- "chainlit>=1.1.202",
18
  "websockets>=14.1",
 
19
  ]
20
 
21
  [dependency-groups]
 
14
  "langchain-openai>=0.2.14",
15
  "langgraph>=0.2.60",
16
  "pymupdf>=1.25.1",
 
17
  "websockets>=14.1",
18
+ "chainlit>=1.1.202",
19
  ]
20
 
21
  [dependency-groups]
uv.lock CHANGED
@@ -258,7 +258,7 @@ dependencies = [
258
  { name = "pydantic", marker = "python_full_version >= '3.13'" },
259
  { name = "pyjwt", marker = "python_full_version >= '3.13'" },
260
  { name = "python-dotenv", marker = "python_full_version >= '3.13'" },
261
- { name = "python-multipart", marker = "python_full_version >= '3.13'" },
262
  { name = "starlette", version = "0.37.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13'" },
263
  { name = "syncer", marker = "python_full_version >= '3.13'" },
264
  { name = "tomli", marker = "python_full_version >= '3.13'" },
@@ -273,7 +273,7 @@ wheels = [
273
 
274
  [[package]]
275
  name = "chainlit"
276
- version = "1.3.2"
277
  source = { registry = "https://pypi.org/simple" }
278
  resolution-markers = [
279
  "python_full_version < '3.12.4'",
@@ -295,7 +295,7 @@ dependencies = [
295
  { name = "pydantic", marker = "python_full_version < '3.13'" },
296
  { name = "pyjwt", marker = "python_full_version < '3.13'" },
297
  { name = "python-dotenv", marker = "python_full_version < '3.13'" },
298
- { name = "python-multipart", marker = "python_full_version < '3.13'" },
299
  { name = "python-socketio", marker = "python_full_version < '3.13'" },
300
  { name = "starlette", version = "0.41.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.13'" },
301
  { name = "syncer", marker = "python_full_version < '3.13'" },
@@ -304,9 +304,9 @@ dependencies = [
304
  { name = "uvicorn", marker = "python_full_version < '3.13'" },
305
  { name = "watchfiles", marker = "python_full_version < '3.13'" },
306
  ]
307
- sdist = { url = "https://files.pythonhosted.org/packages/6a/57/8c3c354b3e82fae0791b58304634b356aeea8791dd0b6f817a8d8787cad6/chainlit-1.3.2.tar.gz", hash = "sha256:5562246ca42c858a42e86efdd5a46c27fd93481706f6fe47416109c47ac41b77", size = 4271845 }
308
  wheels = [
309
- { url = "https://files.pythonhosted.org/packages/ca/d8/2a0c00b2236c93f31df429642de6414c56510d39392101b1b102074520d0/chainlit-1.3.2-py3-none-any.whl", hash = "sha256:c0926e8b157a67aede511a8220307889d04e9458ff3f94176e004fe533662d74", size = 4332216 },
310
  ]
311
 
312
  [[package]]
@@ -783,7 +783,7 @@ version = "0.1.0"
783
  source = { virtual = "." }
784
  dependencies = [
785
  { name = "chainlit", version = "1.1.202", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13'" },
786
- { name = "chainlit", version = "1.3.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.13'" },
787
  { name = "langchain" },
788
  { name = "langchain-community" },
789
  { name = "langchain-openai" },
@@ -1808,11 +1808,27 @@ wheels = [
1808
  name = "python-multipart"
1809
  version = "0.0.9"
1810
  source = { registry = "https://pypi.org/simple" }
 
 
 
1811
  sdist = { url = "https://files.pythonhosted.org/packages/5c/0f/9c55ac6c84c0336e22a26fa84ca6c51d58d7ac3a2d78b0dfa8748826c883/python_multipart-0.0.9.tar.gz", hash = "sha256:03f54688c663f1b7977105f021043b0793151e4cb1c1a9d4a11fc13d622c4026", size = 31516 }
1812
  wheels = [
1813
  { url = "https://files.pythonhosted.org/packages/3d/47/444768600d9e0ebc82f8e347775d24aef8f6348cf00e9fa0e81910814e6d/python_multipart-0.0.9-py3-none-any.whl", hash = "sha256:97ca7b8ea7b05f977dc3849c3ba99d51689822fab725c3703af7c866a0c2b215", size = 22299 },
1814
  ]
1815
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1816
  [[package]]
1817
  name = "python-socketio"
1818
  version = "5.12.1"
 
258
  { name = "pydantic", marker = "python_full_version >= '3.13'" },
259
  { name = "pyjwt", marker = "python_full_version >= '3.13'" },
260
  { name = "python-dotenv", marker = "python_full_version >= '3.13'" },
261
+ { name = "python-multipart", version = "0.0.9", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13'" },
262
  { name = "starlette", version = "0.37.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13'" },
263
  { name = "syncer", marker = "python_full_version >= '3.13'" },
264
  { name = "tomli", marker = "python_full_version >= '3.13'" },
 
273
 
274
  [[package]]
275
  name = "chainlit"
276
+ version = "2.0.0"
277
  source = { registry = "https://pypi.org/simple" }
278
  resolution-markers = [
279
  "python_full_version < '3.12.4'",
 
295
  { name = "pydantic", marker = "python_full_version < '3.13'" },
296
  { name = "pyjwt", marker = "python_full_version < '3.13'" },
297
  { name = "python-dotenv", marker = "python_full_version < '3.13'" },
298
+ { name = "python-multipart", version = "0.0.18", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.13'" },
299
  { name = "python-socketio", marker = "python_full_version < '3.13'" },
300
  { name = "starlette", version = "0.41.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.13'" },
301
  { name = "syncer", marker = "python_full_version < '3.13'" },
 
304
  { name = "uvicorn", marker = "python_full_version < '3.13'" },
305
  { name = "watchfiles", marker = "python_full_version < '3.13'" },
306
  ]
307
+ sdist = { url = "https://files.pythonhosted.org/packages/45/24/424679b769664876093b3e42167911535d1739bc1bc88f3963c69affed9e/chainlit-2.0.0.tar.gz", hash = "sha256:47b3a274a20cefb443f356d69f1c6a48818d67eb4a11552c749bfa6f414423ed", size = 4637040 }
308
  wheels = [
309
+ { url = "https://files.pythonhosted.org/packages/87/2a/e2bbb86fc3a34c7bf798644edb95bf14fd79a8b3f6c99e4b27e5df1e24f0/chainlit-2.0.0-py3-none-any.whl", hash = "sha256:2b58ac6b513d94aef0380d1d68b73f74718c0c844586b050ce8d5e0a82eb8133", size = 4703622 },
310
  ]
311
 
312
  [[package]]
 
783
  source = { virtual = "." }
784
  dependencies = [
785
  { name = "chainlit", version = "1.1.202", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13'" },
786
+ { name = "chainlit", version = "2.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.13'" },
787
  { name = "langchain" },
788
  { name = "langchain-community" },
789
  { name = "langchain-openai" },
 
1808
  name = "python-multipart"
1809
  version = "0.0.9"
1810
  source = { registry = "https://pypi.org/simple" }
1811
+ resolution-markers = [
1812
+ "python_full_version >= '3.13'",
1813
+ ]
1814
  sdist = { url = "https://files.pythonhosted.org/packages/5c/0f/9c55ac6c84c0336e22a26fa84ca6c51d58d7ac3a2d78b0dfa8748826c883/python_multipart-0.0.9.tar.gz", hash = "sha256:03f54688c663f1b7977105f021043b0793151e4cb1c1a9d4a11fc13d622c4026", size = 31516 }
1815
  wheels = [
1816
  { url = "https://files.pythonhosted.org/packages/3d/47/444768600d9e0ebc82f8e347775d24aef8f6348cf00e9fa0e81910814e6d/python_multipart-0.0.9-py3-none-any.whl", hash = "sha256:97ca7b8ea7b05f977dc3849c3ba99d51689822fab725c3703af7c866a0c2b215", size = 22299 },
1817
  ]
1818
 
1819
+ [[package]]
1820
+ name = "python-multipart"
1821
+ version = "0.0.18"
1822
+ source = { registry = "https://pypi.org/simple" }
1823
+ resolution-markers = [
1824
+ "python_full_version < '3.12.4'",
1825
+ "python_full_version >= '3.12.4' and python_full_version < '3.13'",
1826
+ ]
1827
+ sdist = { url = "https://files.pythonhosted.org/packages/b4/86/b6b38677dec2e2e7898fc5b6f7e42c2d011919a92d25339451892f27b89c/python_multipart-0.0.18.tar.gz", hash = "sha256:7a68db60c8bfb82e460637fa4750727b45af1d5e2ed215593f917f64694d34fe", size = 36622 }
1828
+ wheels = [
1829
+ { url = "https://files.pythonhosted.org/packages/13/6b/b60f47101ba2cac66b4a83246630e68ae9bbe2e614cbae5f4465f46dee13/python_multipart-0.0.18-py3-none-any.whl", hash = "sha256:efe91480f485f6a361427a541db4796f9e1591afc0fb8e7a4ba06bfbc6708996", size = 24389 },
1830
+ ]
1831
+
1832
  [[package]]
1833
  name = "python-socketio"
1834
  version = "5.12.1"