# app/embed_documents.py
# %%
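"""Embed local documents into Qdrant.

Walks the topic sub-folders under ./data, loads .txt and .pdf files, splits them
into overlapping chunks, embeds them with intfloat/e5-base-v2, and uploads each
topic into its own Qdrant collection (collection name = folder name).
"""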
import os
from pathlib import Path
from uuid import uuid4

from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance
from langchain_core.documents import Document
from langchain_qdrant import QdrantVectorStore
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFLoader
# %%
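# Qdrant connection settings, read from the environment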
QDRANT_URL = os.getenv('QDRANT_URL')
QDRANT_API_KEY = os.getenv('QDRANT_API_KEY')
# %%
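# Per-topic collection names; the loop at the bottom creates one collection per
# sub-folder of ./data, so these are expected to match the folder names.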
FAQ_COLLECTION = "faqs"
BLOGS_COLLECTION = "blogs"
TECHNOLOGY_COLLECTION = "technology"
REVOLUTION_COLLECTION = "revolution"
SUPPORT_COLLECTION = "support"
PRODUCT_COLLECTION = "product"
# %%
client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)
embedding_model = "intfloat/e5-base-v2"  # produces 768-dimensional embeddings
embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
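# Note: the e5 model card recommends "query: "/"passage: " prefixes on inputs;
# this script embeds the raw chunk text without them.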
# %%
data_directory = Path(__file__).parent / "data"  # one sub-folder per topic
text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=64)
# %%
# Delete Collection
def delete_collection(collection_name):
    if client.collection_exists(collection_name):
        client.delete_collection(collection_name)
        print(f"Collection '{collection_name}' deleted.")
# %%
# Create Collection
def create_collection(collection_name):
    if not client.collection_exists(collection_name):
        client.create_collection(
            collection_name=collection_name,
            # e5-base-v2 embeddings are 768-dimensional, so the collection must match
            vectors_config=VectorParams(size=768, distance=Distance.COSINE),
        )
        print(f"Created Collection: {collection_name}")
# %%
def load_documents_from_folder(folder_path):
    documents = []
    # Text files: the first line holds the source URL, the rest is the content
    for file_path in folder_path.rglob("*.txt"):
        with open(file_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
        if not lines:
            print(f"{file_path} is empty")
            continue
        source_url = lines[0].replace("Source URL:", "").strip()
        content = "".join(lines[1:]).strip()
        topic = file_path.parent.name
        if content:
            doc = Document(
                page_content=content,
                metadata={'source': source_url,
                          'topic': topic}
            )
            documents.append(doc)
    # PDF files: PyPDFLoader yields one Document per page; tag each with its topic
    for file_path in folder_path.rglob("*.pdf"):
        try:
            loader = PyPDFLoader(str(file_path))
            docs = loader.load()
            for doc in docs:
                doc.metadata["topic"] = file_path.parent.name
            documents.extend(docs)
        except Exception as e:
            print(f"Failed to load PDF {file_path}: {e}")
    return documents
# %%
def split_and_upload_to_qdrant(collection_name, documents):
    splits = text_splitter.split_documents(documents)
    uuids = [str(uuid4()) for _ in range(len(splits))]
    vector_store = QdrantVectorStore(
        client=client,
        collection_name=collection_name,
        embedding=embeddings,
    )
    vector_store.add_documents(documents=splits, ids=uuids)
    print(f"Uploaded {len(splits)} chunks to {collection_name}")
# %%
sub_folders = [sub_folder for sub_folder in data_directory.iterdir() if sub_folder.is_dir()]
for topic in sub_folders:
    collection_name = topic.name
    print(f"Processing: {topic.name}")
    delete_collection(collection_name)
    create_collection(collection_name)
    docs = load_documents_from_folder(topic)
    print(f"Loaded {len(docs)} docs from {topic}")
    if docs:
        split_and_upload_to_qdrant(collection_name, docs)
    print('\n')
# %%
"""collection_name = 'wellness_docs'
delete_collection(collection_name)
create_collection(collection_name)
sub_folders = [sub_folder for sub_folder in data_directory.iterdir() if sub_folder.is_dir()]
for topic in sub_folders:
print(f"Processing: {topic.name}")
docs = load_documents_from_folder(topic)
print(f"Loaded {len(docs)} docs from {topic}")
if docs:
split_and_upload_to_qdrant(collection_name, docs)
print('\n')"""
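# %%
# Optional sanity check: a minimal sketch, not part of the original pipeline.
# It assumes a "faqs" topic folder was uploaded above; the query string is a
# hypothetical example, so adjust both to your data.
check_collection = FAQ_COLLECTION
if client.collection_exists(check_collection):
    store = QdrantVectorStore(
        client=client,
        collection_name=check_collection,
        embedding=embeddings,
    )
    # Retrieve the top 3 chunks and print their source URL and a content preview
    for hit in store.similarity_search("How do I get started?", k=3):
        print(hit.metadata.get("source"), "->", hit.page_content[:80])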