Spaces:
Sleeping
Sleeping
| import logging | |
| from data.load_dataset import load_data | |
| from retriever.embed_documents import embed_documents | |
| from retriever.chunk_documents import chunk_documents | |
loaded_datasets = set()  # Module-level registry of dataset names already loaded, chunked, and embedded in this process
def load_selected_datasets(selected_datasets, config) -> str:
    """Load, chunk, and embed the selected datasets.

    Args:
        selected_datasets: Iterable of dataset names to load. Falsy (empty
            or ``None``) short-circuits with a status message.
        config: Application config object; its ``vector_store`` and
            ``loaded_datasets`` attributes are updated as a side effect.
            NOTE(review): assumed to expose ``detect_loaded_datasets()`` —
            confirm against the config class.

    Returns:
        A human-readable status string (the original sometimes returned the
        raw ``loaded_datasets`` set here, contradicting the ``-> str``
        annotation and the other return path).
    """
    global loaded_datasets

    if not selected_datasets:
        return "No dataset selected."

    all_chunked_documents = []
    for data_set_name in selected_datasets:
        # Lazy %-style args so the message is only formatted if emitted.
        logging.info("Loading dataset: %s", data_set_name)
        documents = load_data(data_set_name)

        # 'cuad' holds long legal contracts, hence the larger chunk size.
        chunk_size = 4000 if data_set_name == 'cuad' else 1000
        all_chunked_documents.extend(
            chunk_documents(documents, chunk_size=chunk_size, chunk_overlap=200)
        )

        loaded_datasets.add(data_set_name)

    # Log the final count once, after the loop — the original logged this
    # per-iteration despite its "final count" comment.
    logging.info("Total chunked documents: %d", len(all_chunked_documents))

    # Embed all chunks in one pass and expose the store on the config.
    config.vector_store = embed_documents(all_chunked_documents)
    logging.info("Document embedding completed.")

    # Refresh the loaded-dataset view after loading.
    config.loaded_datasets = config.detect_loaded_datasets()

    # sorted() gives a deterministic listing (set order is arbitrary).
    return f"Loaded datasets: {', '.join(sorted(loaded_datasets))}"