Spaces:
Running
Running
| # %% | |
| from qdrant_client import QdrantClient | |
| from qdrant_client.models import VectorParams, Distance | |
| from langchain_core.documents import Document | |
| from langchain_qdrant import QdrantVectorStore | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from langchain_huggingface import HuggingFaceEmbeddings | |
| from langchain_community.document_loaders import PyPDFLoader | |
| import os | |
| from pathlib import Path | |
| from uuid import uuid4 | |
| # %% | |
# Qdrant connection settings come from the environment; each value is None
# when the corresponding variable is not set.
QDRANT_URL = os.environ.get('QDRANT_URL')
QDRANT_API_KEY = os.environ.get('QDRANT_API_KEY')
# %%
# Named collection constants (not referenced in the visible part of this file).
FAQ_COLLECTION = "faqs"
BLOGS_COLLECTION = "blogs"
TECHNOLOGY_COLLECTION = "technology"
REVOLUTION_COLLECTION = "revolution"
SUPPORT_COLLECTION = "support"
PRODUCT_COLLECTION = "product"
# %%
# Shared Qdrant client and embedding model used by the functions below.
embedding_model = "intfloat/e5-base-v2"
client = QdrantClient(api_key=QDRANT_API_KEY, url=QDRANT_URL)
embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
# %%
# Input data lives in a "data" folder next to this script.
data_directory = Path(__file__).with_name("data")
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=64, chunk_size=512)
| # %% | |
| #Delete Collection | |
def delete_collection(collection_name):
    """Drop the Qdrant collection *collection_name* if it exists.

    Does nothing (and prints nothing) when the collection is absent.
    """
    if not client.collection_exists(collection_name):
        return
    client.delete_collection(collection_name)
    print(f"Collection '{collection_name}' deleted.")
| # %% | |
| #Create Collection | |
def create_collection(collection_name, vector_size=None):
    """Create the Qdrant collection *collection_name* if it does not exist.

    Args:
        collection_name: Name of the collection to create.
        vector_size: Dimensionality of the stored vectors. When omitted, it
            is measured from the module-level ``embeddings`` model so the
            collection always matches the vectors that will be uploaded.

    The collection uses cosine distance. Nothing happens when a collection
    with this name already exists.
    """
    if client.collection_exists(collection_name):
        return
    if vector_size is None:
        # The previous hard-coded size of 1024 did not match the configured
        # embedding model (intfloat/e5-base-v2 is documented as producing
        # 768-dimensional vectors). Embedding a short probe string yields the
        # real dimension for whichever model is configured.
        vector_size = len(embeddings.embed_query("dimension probe"))
    client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
    )
    print(f"Created Collection: {collection_name}")
| # %% | |
def load_documents_from_folder(folder_path):
    """Load every ``.txt`` and ``.pdf`` file under *folder_path* as documents.

    Text files are expected to carry a ``Source URL:`` header on their first
    line; the remainder of the file becomes the document content. Files that
    are empty, or that have no content after the header line, are skipped.
    PDFs are loaded with ``PyPDFLoader``. Every document is tagged with a
    ``topic`` metadata entry set to the name of the file's parent directory.

    Args:
        folder_path: ``pathlib.Path`` of the directory to search recursively.

    Returns:
        A list of ``Document`` objects: text documents first, then PDF
        documents, each group in sorted path order.
    """
    documents = []

    # Sorted so the ingest order does not depend on filesystem enumeration.
    for file_path in sorted(folder_path.rglob("*.txt")):
        try:
            # utf-8-sig strips a leading BOM so it cannot leak into the
            # source URL; files without a BOM decode exactly as with utf-8.
            with open(file_path, 'r', encoding='utf-8-sig') as f:
                lines = f.readlines()
        except (OSError, UnicodeDecodeError) as e:
            # Skip unreadable files instead of aborting the whole ingest,
            # mirroring how PDF failures are handled below.
            print(f"Failed to load text file {file_path}: {e}")
            continue
        if not lines:
            print(f"{file_path} is empty")
            continue
        source_url = lines[0].replace("Source URL:", "").strip()
        content = "".join(lines[1:]).strip()
        if not content:
            continue
        documents.append(
            Document(
                page_content=content,
                metadata={'source': source_url,
                          'topic': file_path.parent.name},
            )
        )

    for file_path in sorted(folder_path.rglob("*.pdf")):
        try:
            # Pass a plain string: some PyPDFLoader releases may not accept
            # a Path object here (assumption; see the loader's API docs).
            pages = PyPDFLoader(str(file_path)).load()
        except Exception as e:
            # Best effort: one broken PDF must not stop the ingest.
            print(f"Failed to load PDF {file_path}: {e}")
            continue
        for page in pages:
            page.metadata["topic"] = file_path.parent.name
        documents.extend(pages)

    return documents
| # %% | |
def split_and_upload_to_qdrant(collection_name, documents):
    """Chunk *documents* and add the chunks to *collection_name*.

    The documents are split with the module-level ``text_splitter``, each
    resulting chunk gets a freshly generated UUID4 string as its id, and the
    chunks are added through a ``QdrantVectorStore`` bound to the shared
    ``client`` and ``embeddings``. A summary line is printed afterwards.
    """
    chunks = text_splitter.split_documents(documents)
    # One random id per chunk.
    chunk_ids = [str(uuid4()) for _ in chunks]
    store = QdrantVectorStore(
        embedding=embeddings,
        collection_name=collection_name,
        client=client,
    )
    store.add_documents(documents=chunks, ids=chunk_ids)
    print(f"Uploaded {len(chunks)} chunks to {collection_name}")
| # %% | |
# Rebuild one collection per sub-directory of the data folder; the directory
# name doubles as the collection name.
sub_folders = list(filter(Path.is_dir, data_directory.iterdir()))
for topic in sub_folders:
    collection_name = topic.name
    print(f"Processing: {topic.name}")

    # Start from a clean collection on every run.
    delete_collection(collection_name)
    create_collection(collection_name)

    docs = load_documents_from_folder(topic)
    print(f"Loaded {len(docs)} docs from {topic}")
    if docs:
        split_and_upload_to_qdrant(collection_name, docs)
    print('\n')
| # %% | |
| """collection_name = 'wellness_docs' | |
| delete_collection(collection_name) | |
| create_collection(collection_name) | |
| sub_folders = [sub_folder for sub_folder in data_directory.iterdir() if sub_folder.is_dir()] | |
| for topic in sub_folders: | |
| print(f"Processing: {topic.name}") | |
| docs = load_documents_from_folder(topic) | |
| print(f"Loaded {len(docs)} docs from {topic}") | |
| if docs: | |
| split_and_upload_to_qdrant(collection_name, docs) | |
| print('\n')""" | |