Spaces:
Running
Running
| # rag_system.py | |
| import os | |
| import logging | |
| import shutil | |
| import json | |
| from typing import Optional | |
| from rag_components import KnowledgeRAG | |
| from utils import download_and_unzip_gdrive_folder | |
| from config import ( | |
| GROQ_API_KEY, GDRIVE_SOURCES_ENABLED, GDRIVE_FOLDER_ID_OR_URL, RAG_SOURCES_DIR, | |
| RAG_STORAGE_PARENT_DIR, RAG_FAISS_INDEX_SUBDIR_NAME, RAG_LOAD_INDEX_ON_STARTUP, | |
| RAG_EMBEDDING_MODEL_NAME, RAG_LLM_MODEL_NAME, | |
| RAG_EMBEDDING_USE_GPU, RAG_LLM_TEMPERATURE, RAG_CHUNK_SIZE, RAG_CHUNK_OVERLAP, | |
| RAG_RERANKER_MODEL_NAME, RAG_RERANKER_ENABLED, RAG_CHUNKED_SOURCES_FILENAME | |
| ) | |
| logger = logging.getLogger(__name__) | |
| # MODIFIED: Added source_dir_override parameter | |
| def initialize_and_get_rag_system(force_rebuild: bool = False, source_dir_override: Optional[str] = None) -> Optional[KnowledgeRAG]: | |
| """ | |
| Initializes and returns the KnowledgeRAG system. | |
| Can force a rebuild by deleting the existing index first. | |
| Uses module-level configuration constants. | |
| Downloads sources from GDrive if configured. | |
| """ | |
| logger.info("[RAG_SYSTEM_INIT] ========== Initializing RAG System ==========") | |
| if not GROQ_API_KEY: | |
| logger.error("[RAG_SYSTEM_INIT] Groq API Key (BOT_API_KEY) not found. RAG system cannot be initialized.") | |
| return None | |
| # MODIFIED: Determine the source directory to use | |
| source_dir_to_use = source_dir_override if source_dir_override and os.path.isdir(source_dir_override) else RAG_SOURCES_DIR | |
| if source_dir_override and not os.path.isdir(source_dir_override): | |
| logger.error(f"[RAG_SYSTEM_INIT] Custom source directory override '{source_dir_override}' not found. Aborting.") | |
| return None # Or handle error appropriately | |
| logger.info(f"[RAG_SYSTEM_INIT] Using source directory: '{source_dir_to_use}'") | |
| if GDRIVE_SOURCES_ENABLED and not source_dir_override: # Only download if not using a custom directory | |
| logger.info("[RAG_SYSTEM_INIT] Google Drive sources download is ENABLED") | |
| if GDRIVE_FOLDER_ID_OR_URL: | |
| # ... (rest of GDrive logic is unchanged) | |
| logger.info(f"[RAG_SYSTEM_INIT] Downloading from Google Drive: {GDRIVE_FOLDER_ID_OR_URL}") | |
| if os.path.isdir(RAG_SOURCES_DIR): | |
| logger.info(f"[RAG_SYSTEM_INIT] Clearing existing contents of {RAG_SOURCES_DIR}") | |
| try: | |
| for item_name in os.listdir(RAG_SOURCES_DIR): | |
| item_path = os.path.join(RAG_SOURCES_DIR, item_name) | |
| if os.path.isfile(item_path) or os.path.islink(item_path): | |
| os.unlink(item_path) | |
| elif os.path.isdir(item_path): | |
| shutil.rmtree(item_path) | |
| logger.info(f"[RAG_SYSTEM_INIT] Successfully cleared {RAG_SOURCES_DIR}") | |
| except Exception as e_clear: | |
| logger.error(f"[RAG_SYSTEM_INIT] Could not clear {RAG_SOURCES_DIR}: {e_clear}") | |
| download_successful = download_and_unzip_gdrive_folder(GDRIVE_FOLDER_ID_OR_URL, RAG_SOURCES_DIR) | |
| if download_successful: | |
| logger.info(f"[RAG_SYSTEM_INIT] Successfully populated sources from Google Drive") | |
| else: | |
| logger.error("[RAG_SYSTEM_INIT] Failed to download sources from Google Drive") | |
| else: | |
| logger.warning("[RAG_SYSTEM_INIT] GDRIVE_SOURCES_ENABLED is True but GDRIVE_FOLDER_URL not set") | |
| elif not source_dir_override: | |
| logger.info("[RAG_SYSTEM_INIT] Google Drive sources download is DISABLED") | |
| faiss_index_actual_path = os.path.join(RAG_STORAGE_PARENT_DIR, RAG_FAISS_INDEX_SUBDIR_NAME) | |
| processed_files_metadata_path = os.path.join(faiss_index_actual_path, "processed_files.json") | |
| if force_rebuild: | |
| logger.info(f"[RAG_SYSTEM_INIT] Force rebuild: Deleting existing FAISS index at '{faiss_index_actual_path}'") | |
| if os.path.exists(faiss_index_actual_path): | |
| try: | |
| shutil.rmtree(faiss_index_actual_path) | |
| logger.info(f"[RAG_SYSTEM_INIT] Deleted existing FAISS index") | |
| except Exception as e_del: | |
| logger.error(f"[RAG_SYSTEM_INIT] Could not delete existing FAISS index: {e_del}", exc_info=True) | |
| try: | |
| logger.info("[RAG_SYSTEM_INIT] Creating KnowledgeRAG instance...") | |
| current_rag_instance = KnowledgeRAG( | |
| index_storage_dir=RAG_STORAGE_PARENT_DIR, | |
| embedding_model_name=RAG_EMBEDDING_MODEL_NAME, | |
| groq_model_name_for_rag=RAG_LLM_MODEL_NAME, | |
| use_gpu_for_embeddings=RAG_EMBEDDING_USE_GPU, | |
| groq_api_key_for_rag=GROQ_API_KEY, | |
| temperature=RAG_LLM_TEMPERATURE, | |
| chunk_size=RAG_CHUNK_SIZE, | |
| chunk_overlap=RAG_CHUNK_OVERLAP, | |
| reranker_model_name=RAG_RERANKER_MODEL_NAME, | |
| enable_reranker=RAG_RERANKER_ENABLED, | |
| ) | |
| operation_successful = False | |
| if RAG_LOAD_INDEX_ON_STARTUP and not force_rebuild: | |
| logger.info(f"[RAG_SYSTEM_INIT] Attempting to load index from disk") | |
| try: | |
| current_rag_instance.load_index_from_disk() | |
| operation_successful = True | |
| logger.info(f"[RAG_SYSTEM_INIT] Index loaded successfully from: {faiss_index_actual_path}") | |
| except FileNotFoundError: | |
| logger.warning(f"[RAG_SYSTEM_INIT] Pre-built index not found. Will build from source files") | |
| except Exception as e_load: | |
| logger.error(f"[RAG_SYSTEM_INIT] Error loading index: {e_load}. Will build from source files", exc_info=True) | |
| if not operation_successful: | |
| logger.info(f"[RAG_SYSTEM_INIT] Building new index from source data in '{source_dir_to_use}'") # MODIFIED: Use correct dir | |
| try: | |
| pre_chunked_path = os.path.join(RAG_STORAGE_PARENT_DIR, RAG_CHUNKED_SOURCES_FILENAME) | |
| if not os.path.exists(pre_chunked_path) and (not os.path.isdir(source_dir_to_use) or not os.listdir(source_dir_to_use)): # MODIFIED: Use correct dir | |
| logger.error(f"[RAG_SYSTEM_INIT] Neither pre-chunked JSON nor raw source files found") | |
| os.makedirs(faiss_index_actual_path, exist_ok=True) | |
| with open(os.path.join(faiss_index_actual_path, "index.faiss"), "w") as f_dummy: f_dummy.write("") | |
| with open(os.path.join(faiss_index_actual_path, "index.pkl"), "w") as f_dummy: f_dummy.write("") | |
| logger.info("[RAG_SYSTEM_INIT] Created dummy index files") | |
| current_rag_instance.processed_source_files = ["No source files found to build index."] | |
| raise FileNotFoundError(f"Sources directory '{source_dir_to_use}' is empty") # MODIFIED: Use correct dir | |
| current_rag_instance.build_index_from_source_files( | |
| source_folder_path=source_dir_to_use # MODIFIED: Use correct dir | |
| ) | |
| os.makedirs(faiss_index_actual_path, exist_ok=True) | |
| with open(processed_files_metadata_path, 'w') as f: | |
| json.dump(current_rag_instance.processed_source_files, f) | |
| operation_successful = True | |
| logger.info(f"[RAG_SYSTEM_INIT] Index built successfully from source data") | |
| except FileNotFoundError as e_fnf: | |
| logger.critical(f"[RAG_SYSTEM_INIT] FATAL: No source data found: {e_fnf}", exc_info=False) | |
| return None | |
| except ValueError as e_val: | |
| logger.critical(f"[RAG_SYSTEM_INIT] FATAL: No processable documents found: {e_val}", exc_info=False) | |
| return None | |
| except Exception as e_build: | |
| logger.critical(f"[RAG_SYSTEM_INIT] FATAL: Failed to build FAISS index: {e_build}", exc_info=True) | |
| return None | |
| if operation_successful and current_rag_instance.vector_store: | |
| logger.info("[RAG_SYSTEM_INIT] ========== RAG System Initialized Successfully ==========") | |
| return current_rag_instance | |
| else: | |
| logger.error("[RAG_SYSTEM_INIT] Index was neither loaded nor built successfully") | |
| return None | |
| except Exception as e_init_components: | |
| logger.critical(f"[RAG_SYSTEM_INIT] FATAL: Failed to initialize RAG system components: {e_init_components}", exc_info=True) | |
| return None |