Informed_Consent / getVectorstore.py
Mdean77's picture
Tried vectorstore cache but failed.
5e4b78a
from qdrant_client import QdrantClient
from qdrant_client.http import models as rest
from langchain_qdrant import QdrantVectorStore
import hashlib
import defaults
# Embedding model handed to QdrantVectorStore as `embedding=`; taken from the
# project's `defaults` module.
embedding_model = defaults.default_embedding_model
# URL passed to both QdrantClient and QdrantVectorStore; taken from the
# project's `defaults` module.
qdrant_url = defaults.default_url
"""
This code creates a hash for every chunk and checks to see if that chunk already exists in the
vector database. We only want one collection in Qdrant, but want to make sure that if a user
selects a document that has already been embedded and stored, it does not get stored again. We
also add metadata for the document title, so that we can make our retriever focus on documents of
interest. For example, after some usage, the application might have 20 documents for the user to
select from. We want the retriever to be exactly right for the documents that they selected.
This is also useful when several versions of a document exist: we do not have to
recreate a large vectorstore, and the user can still select the most recent version.
"""
def get_document_hash(doc_content):
    """Return the hexadecimal MD5 fingerprint of a chunk's text.

    The text is encoded with ``str.encode()``'s default codec before hashing,
    so identical strings always map to the same 32-character digest.
    """
    hasher = hashlib.md5()
    hasher.update(doc_content.encode())
    return hasher.hexdigest()
# Number of points requested per scroll page when looking up stored hashes.
_SCROLL_PAGE_SIZE = 256


def _tag_documents(document, file_name):
    """Stamp each chunk with its content hash, document title and source id."""
    for i, doc in enumerate(document):
        doc.metadata['content_hash'] = get_document_hash(doc.page_content)
        doc.metadata['document_title'] = file_name
        doc.metadata['source'] = f"source_{i}"


def _fetch_existing_hashes(client, collection_name, hashes):
    """Return the subset of ``hashes`` already stored in ``collection_name``."""
    found = set()
    if not hashes:
        # Nothing to look up; avoid sending an empty filter to the server.
        return found

    scroll_filter = rest.Filter(
        must=[
            rest.FieldCondition(
                key="metadata.content_hash",
                match=rest.MatchAny(any=sorted(hashes)),
            )
        ]
    )

    # Page through every matching point. A single scroll capped at
    # len(document) can miss hashes when the collection holds more matching
    # points than the incoming batch has chunks (e.g. stored duplicates).
    offset = None
    while True:
        points, offset = client.scroll(
            collection_name=collection_name,
            scroll_filter=scroll_filter,
            limit=_SCROLL_PAGE_SIZE,
            offset=offset,
            with_payload=True,
            with_vectors=False,
        )
        for point in points:
            metadata = (point.payload or {}).get('metadata') or {}
            content_hash = metadata.get('content_hash')
            if content_hash is not None:
                found.add(content_hash)
        if offset is None:
            break
    return found


def getVectorstore(document, file_name, collection_name="protocol_collection"):
    """Store the chunks of one document in Qdrant and return the vector store.

    Every chunk is tagged in place with three metadata entries:
    ``content_hash`` (hash of ``page_content``), ``document_title``
    (``file_name``) and ``source`` (``"source_<index>"``).

    If the collection already exists, only chunks whose hash is not yet stored
    are added. Otherwise the collection is created from all chunks.

    Args:
        document: Sized sequence of chunk objects exposing ``page_content``
            and a mutable ``metadata`` dict.
        file_name: Title recorded under ``metadata['document_title']``.
        collection_name: Qdrant collection to use; defaults to the single
            shared ``"protocol_collection"``.

    Returns:
        The ``QdrantVectorStore`` bound to ``collection_name``.
    """
    # NOTE(review): the hash covers chunk text only, so a chunk shared by two
    # files is stored once, under the first file's title -- confirm this is
    # the intended behavior for title-filtered retrieval.
    _tag_documents(document, file_name)

    client = QdrantClient(url=qdrant_url)

    if not client.collection_exists(collection_name):
        # First use: create the collection and store every chunk.
        print("Collection does not exist")
        return QdrantVectorStore.from_documents(
            documents=document,
            embedding=embedding_model,
            collection_name=collection_name,
            url=qdrant_url,
        )

    print("Collection exists")
    qdrant_vectorstore = QdrantVectorStore.from_existing_collection(
        embedding=embedding_model,
        collection_name=collection_name,
        url=qdrant_url,
    )

    wanted_hashes = {doc.metadata['content_hash'] for doc in document}
    existing_hashes = _fetch_existing_hashes(client, collection_name, wanted_hashes)
    new_docs = [
        doc for doc in document
        if doc.metadata['content_hash'] not in existing_hashes
    ]

    if new_docs:
        qdrant_vectorstore.add_documents(new_docs)
    print(f"Added {len(new_docs)} new documents")
    # Report chunks actually skipped, not the number of distinct hashes found.
    print(f"Skipped {len(document) - len(new_docs)} existing documents")
    return qdrant_vectorstore