# Vectorization pipeline: loads processed CSV data and builds FAISS vector stores.
import glob | |
import logging | |
from pathlib import Path | |
from typing import List, Optional | |
from langchain_community.vectorstores import FAISS | |
from langchain_community.document_loaders import CSVLoader | |
from langchain_huggingface import HuggingFaceEmbeddings | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain.docstore.document import Document | |
# Configure logging: persist to vectorize.log and echo everything to the console.
_log_handlers = [
    logging.FileHandler('vectorize.log'),
    logging.StreamHandler(),
]
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=_log_handlers,
)
class VectorizationError(Exception):
    """Raised when CSV loading or vector-store creation fails in this pipeline."""
def load_csv_documents(csv_file_path: str) -> List[Document]:
    """
    Load CSV documents matching a glob pattern.

    Args:
        csv_file_path (str): Glob pattern (or literal path) of CSV files to load.

    Returns:
        List[Document]: Documents loaded from every matching CSV file.

    Raises:
        VectorizationError: If no CSV files match the pattern, or if any
            file fails to load (original error chained as __cause__).
    """
    # glob.glob already returns a list; no extra list() needed.
    csv_files = glob.glob(csv_file_path)
    if not csv_files:
        # Raised outside the try below so it is not re-wrapped into a
        # nested "Error loading CSV documents: No CSV files found..." message.
        raise VectorizationError(f"No CSV files found at path: {csv_file_path}")
    documents: List[Document] = []
    try:
        for csv_file in csv_files:
            logging.info(f"Loading CSV file: {csv_file}")
            loader = CSVLoader(csv_file, encoding="utf-8")
            documents.extend(loader.load())
    except Exception as e:
        # Chain the original exception instead of discarding its traceback.
        raise VectorizationError(f"Error loading CSV documents: {str(e)}") from e
    logging.info(f"Successfully loaded {len(documents)} documents from {len(csv_files)} CSV files")
    return documents
def create_vector_store(
    documents: List[Document],
    embeddings_model: HuggingFaceEmbeddings,
    output_path: str,
    chunk_size: int = 500,
    chunk_overlap: int = 50
) -> Optional[FAISS]:
    """
    Create and save a FAISS vector store from documents.

    Args:
        documents (List[Document]): List of documents to vectorize
        embeddings_model (HuggingFaceEmbeddings): The embeddings model to use
        output_path (str): Path to save the FAISS index
        chunk_size (int, optional): Size of text chunks. Defaults to 500.
        chunk_overlap (int, optional): Overlap between chunks. Defaults to 50.

    Returns:
        Optional[FAISS]: The created FAISS index if successful, None otherwise
        (errors are logged with full traceback rather than raised, so callers
        can continue with other stores).
    """
    try:
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap
        )
        chunked_documents = text_splitter.split_documents(documents)
        logging.info(f"Created {len(chunked_documents)} chunks from {len(documents)} documents")
        faiss_index = FAISS.from_documents(chunked_documents, embeddings_model)
        # Ensure the destination directory exists before saving the index.
        Path(output_path).parent.mkdir(parents=True, exist_ok=True)
        faiss_index.save_local(output_path)
        logging.info(f"Successfully saved FAISS index to {output_path}")
        return faiss_index
    except Exception as e:
        # logging.exception records the traceback; plain logging.error loses it.
        logging.exception(f"Error creating vector store: {str(e)}")
        return None
def main():
    """Run the end-to-end vectorization pipeline for the MSD and medical corpora."""
    try:
        # Configuration with relative paths
        msd_data_path = "./processed_data/msd/msd_processed.csv"
        medical_csv_path = "./processed_data/cbip/*.csv"
        msd_vector_path = "./vectors_data/msd_data_vec"
        medical_vector_path = "./vectors_data/med_data_vec"
        model_name = "sentence-transformers/all-MiniLM-L12-v2"

        # Create vectors_data directory if it doesn't exist
        Path("./vectors_data").mkdir(exist_ok=True)
        logging.info("Starting vectorization process")

        # Load source documents for both corpora
        msd_docs = load_csv_documents(msd_data_path)
        med_docs = load_csv_documents(medical_csv_path)

        # Initialize embeddings model
        logging.info(f"Initializing embeddings model: {model_name}")
        embedder = HuggingFaceEmbeddings(model_name=model_name)

        # Build and persist both vector stores
        built_indexes = [
            create_vector_store(msd_docs, embedder, msd_vector_path),
            create_vector_store(med_docs, embedder, medical_vector_path),
        ]
        if all(built_indexes):
            logging.info("Vectorization process completed successfully")
        else:
            logging.error("Vectorization process completed with errors")
    except VectorizationError as ve:
        logging.error(f"Vectorization error: {str(ve)}")
        raise
    except Exception as e:
        logging.error(f"Unexpected error: {str(e)}")
        raise
# Script entry point: build all FAISS indexes when run directly.
if __name__ == "__main__":
    main()