Spaces:
Running
Running
Upload 4 files
Browse files
app.py
CHANGED
|
@@ -30,13 +30,15 @@ load_dotenv()
|
|
| 30 |
from llm_fallback import get_groq_fallback_response
|
| 31 |
from rag_system import initialize_and_get_rag_system
|
| 32 |
from rag_components import KnowledgeRAG
|
| 33 |
-
from utils import download_and_unzip_gdrive_file # MODIFIED: Import the new utility
|
| 34 |
from config import (
|
| 35 |
RAG_SOURCES_DIR,
|
| 36 |
RAG_STORAGE_PARENT_DIR,
|
| 37 |
RAG_CHUNKED_SOURCES_FILENAME,
|
| 38 |
-
GDRIVE_INDEX_ENABLED,
|
| 39 |
-
GDRIVE_INDEX_ID_OR_URL
|
|
|
|
|
|
|
| 40 |
)
|
| 41 |
|
| 42 |
# Setup logging (remains global for the app)
|
|
@@ -1082,7 +1084,23 @@ if __name__ == '__main__':
|
|
| 1082 |
TEXT_EXTRACTIONS_DIR]:
|
| 1083 |
os.makedirs(folder_path, exist_ok=True)
|
| 1084 |
|
| 1085 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1086 |
load_users_from_csv()
|
| 1087 |
|
| 1088 |
load_qa_data_on_startup()
|
|
|
|
| 30 |
from llm_fallback import get_groq_fallback_response
|
| 31 |
from rag_system import initialize_and_get_rag_system
|
| 32 |
from rag_components import KnowledgeRAG
|
| 33 |
+
from utils import download_and_unzip_gdrive_file, download_gdrive_file # MODIFIED: Import the new utility
|
| 34 |
from config import (
|
| 35 |
RAG_SOURCES_DIR,
|
| 36 |
RAG_STORAGE_PARENT_DIR,
|
| 37 |
RAG_CHUNKED_SOURCES_FILENAME,
|
| 38 |
+
GDRIVE_INDEX_ENABLED,
|
| 39 |
+
GDRIVE_INDEX_ID_OR_URL,
|
| 40 |
+
GDRIVE_USERS_CSV_ENABLED, # NEW
|
| 41 |
+
GDRIVE_USERS_CSV_ID_OR_URL # NEW
|
| 42 |
)
|
| 43 |
|
| 44 |
# Setup logging (remains global for the app)
|
|
|
|
| 1084 |
TEXT_EXTRACTIONS_DIR]:
|
| 1085 |
os.makedirs(folder_path, exist_ok=True)
|
| 1086 |
|
| 1087 |
+
# --- NEW: Download users.csv from GDrive if enabled ---
|
| 1088 |
+
if GDRIVE_USERS_CSV_ENABLED:
|
| 1089 |
+
logger.info("[GDRIVE_USERS_DOWNLOAD] Google Drive users.csv download is ENABLED.")
|
| 1090 |
+
if GDRIVE_USERS_CSV_ID_OR_URL:
|
| 1091 |
+
users_csv_target_path = os.path.join(_APP_BASE_DIR, 'assets', 'users.csv')
|
| 1092 |
+
logger.info(f"[GDRIVE_USERS_DOWNLOAD] Attempting to download users.csv to: {users_csv_target_path}")
|
| 1093 |
+
download_successful = download_gdrive_file(GDRIVE_USERS_CSV_ID_OR_URL, users_csv_target_path)
|
| 1094 |
+
if download_successful:
|
| 1095 |
+
logger.info("[GDRIVE_USERS_DOWNLOAD] Successfully downloaded users.csv.")
|
| 1096 |
+
else:
|
| 1097 |
+
logger.error("[GDRIVE_USERS_DOWNLOAD] Failed to download users.csv from Google Drive. Will use existing file or fallback.")
|
| 1098 |
+
else:
|
| 1099 |
+
logger.warning("[GDRIVE_USERS_DOWNLOAD] GDRIVE_USERS_CSV_ENABLED is True, but GDRIVE_USERS_CSV_URL is not set.")
|
| 1100 |
+
else:
|
| 1101 |
+
logger.info("[GDRIVE_USERS_DOWNLOAD] Google Drive users.csv download is DISABLED.")
|
| 1102 |
+
|
| 1103 |
+
# Load users from CSV at startup (will use the downloaded file if successful)
|
| 1104 |
load_users_from_csv()
|
| 1105 |
|
| 1106 |
load_qa_data_on_startup()
|
config.py
CHANGED
|
@@ -48,13 +48,18 @@ RAG_CHUNK_OVERLAP = int(os.getenv("RAG_CHUNK_OVERLAP", 150))
|
|
| 48 |
RAG_RERANKER_MODEL_NAME = os.getenv("RAG_RERANKER_MODEL", "jinaai/jina-reranker-v2-base-multilingual")
|
| 49 |
RAG_RERANKER_ENABLED = os.getenv("RAG_RERANKER_ENABLED", "True").lower() == "true"
|
| 50 |
|
|
|
|
| 51 |
GDRIVE_SOURCES_ENABLED = os.getenv("GDRIVE_SOURCES_ENABLED", "False").lower() == "true"
|
| 52 |
GDRIVE_FOLDER_ID_OR_URL = os.getenv("GDRIVE_FOLDER_URL")
|
| 53 |
|
| 54 |
-
#
|
| 55 |
GDRIVE_INDEX_ENABLED = os.getenv("GDRIVE_INDEX_ENABLED", "False").lower() == "true"
|
| 56 |
GDRIVE_INDEX_ID_OR_URL = os.getenv("GDRIVE_INDEX_URL")
|
| 57 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
|
| 59 |
# Detailed logging configuration
|
| 60 |
RAG_DETAILED_LOGGING = os.getenv("RAG_DETAILED_LOGGING", "True").lower() == "true"
|
|
@@ -67,4 +72,5 @@ logger.info(f"Reranker Model: {RAG_RERANKER_MODEL_NAME}")
|
|
| 67 |
logger.info(f"Retrieval Pipeline: Initial Fetch K={RAG_INITIAL_FETCH_K}, Reranker Final K={RAG_RERANKER_K}")
|
| 68 |
logger.info(f"Detailed Logging: {'ENABLED' if RAG_DETAILED_LOGGING else 'DISABLED'}")
|
| 69 |
logger.info(f"GDrive Sources Download: {'ENABLED' if GDRIVE_SOURCES_ENABLED else 'DISABLED'}")
|
| 70 |
-
logger.info(f"GDrive Pre-built Index Download: {'ENABLED' if GDRIVE_INDEX_ENABLED else 'DISABLED'}")
|
|
|
|
|
|
| 48 |
RAG_RERANKER_MODEL_NAME = os.getenv("RAG_RERANKER_MODEL", "jinaai/jina-reranker-v2-base-multilingual")
|
| 49 |
RAG_RERANKER_ENABLED = os.getenv("RAG_RERANKER_ENABLED", "True").lower() == "true"
|
| 50 |
|
| 51 |
+
# GDrive configuration for RAG sources
|
| 52 |
GDRIVE_SOURCES_ENABLED = os.getenv("GDRIVE_SOURCES_ENABLED", "False").lower() == "true"
|
| 53 |
GDRIVE_FOLDER_ID_OR_URL = os.getenv("GDRIVE_FOLDER_URL")
|
| 54 |
|
| 55 |
+
# GDrive configuration for downloading a pre-built FAISS index
|
| 56 |
GDRIVE_INDEX_ENABLED = os.getenv("GDRIVE_INDEX_ENABLED", "False").lower() == "true"
|
| 57 |
GDRIVE_INDEX_ID_OR_URL = os.getenv("GDRIVE_INDEX_URL")
|
| 58 |
|
| 59 |
+
# --- NEW: GDrive configuration for downloading users.csv ---
|
| 60 |
+
GDRIVE_USERS_CSV_ENABLED = os.getenv("GDRIVE_USERS_CSV_ENABLED", "False").lower() == "true"
|
| 61 |
+
GDRIVE_USERS_CSV_ID_OR_URL = os.getenv("GDRIVE_USERS_CSV_URL")
|
| 62 |
+
|
| 63 |
|
| 64 |
# Detailed logging configuration
|
| 65 |
RAG_DETAILED_LOGGING = os.getenv("RAG_DETAILED_LOGGING", "True").lower() == "true"
|
|
|
|
| 72 |
logger.info(f"Retrieval Pipeline: Initial Fetch K={RAG_INITIAL_FETCH_K}, Reranker Final K={RAG_RERANKER_K}")
|
| 73 |
logger.info(f"Detailed Logging: {'ENABLED' if RAG_DETAILED_LOGGING else 'DISABLED'}")
|
| 74 |
logger.info(f"GDrive Sources Download: {'ENABLED' if GDRIVE_SOURCES_ENABLED else 'DISABLED'}")
|
| 75 |
+
logger.info(f"GDrive Pre-built Index Download: {'ENABLED' if GDRIVE_INDEX_ENABLED else 'DISABLED'}")
|
| 76 |
+
logger.info(f"GDrive users.csv Download: {'ENABLED' if GDRIVE_USERS_CSV_ENABLED else 'DISABLED'}")
|
utils.py
CHANGED
|
@@ -66,6 +66,36 @@ def get_id_from_gdrive_input(url_or_id: str) -> Optional[str]:
|
|
| 66 |
logger.warning(f"Could not reliably extract Google Drive ID from input: {url_or_id}")
|
| 67 |
return None
|
| 68 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
|
| 70 |
def download_and_unzip_gdrive_file(file_id_or_url: str, target_extraction_dir: str) -> bool:
|
| 71 |
"""
|
|
|
|
| 66 |
logger.warning(f"Could not reliably extract Google Drive ID from input: {url_or_id}")
|
| 67 |
return None
|
| 68 |
|
| 69 |
+
def download_gdrive_file(file_id_or_url: str, target_path: str) -> bool:
    """
    Downloads a single file from Google Drive to a specific path.

    Args:
        file_id_or_url: A Google Drive file ID or a URL; it is resolved to a
            file ID via ``get_id_from_gdrive_input``.
        target_path: Destination file path. Its parent directory is created
            when the path has a directory component.

    Returns:
        True when gdown reports a completed download and a non-empty file
        exists at ``target_path``; False on an unresolvable ID, a failed
        download, an empty result, or any exception (which is logged, not
        propagated).
    """
    logger.info(f"[GDRIVE_SINGLE_FILE] Attempting to download file. Input: {file_id_or_url}")

    file_id = get_id_from_gdrive_input(file_id_or_url)
    if not file_id:
        logger.error(f"[GDRIVE_SINGLE_FILE] Invalid Google Drive File ID or URL provided: {file_id_or_url}")
        return False

    try:
        # Ensure the target directory exists before downloading. A bare
        # filename has an empty dirname, and os.makedirs("") raises, so only
        # create the directory when there is one to create.
        target_dir = os.path.dirname(target_path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

        logger.info(f"[GDRIVE_SINGLE_FILE] Downloading file ID: {file_id} to path: {target_path}")
        # Download directly to the target file path.
        # NOTE(review): fuzzy=True is kept from the original call; with an
        # explicit id= it probably has no effect - confirm against gdown docs.
        downloaded_path = gdown.download(id=file_id, output=target_path, quiet=False, fuzzy=True)

        # Check gdown's own result first: a falsy return means the download did
        # not complete. Without this, a stale file left at target_path by an
        # earlier run would satisfy the exists/size check below and a failed
        # download would be reported as a success.
        if (
            not downloaded_path
            or not os.path.exists(target_path)
            or os.path.getsize(target_path) == 0
        ):
            logger.error("[GDRIVE_SINGLE_FILE] Download failed or the resulting file is empty.")
            return False

        logger.info("[GDRIVE_SINGLE_FILE] Download successful.")
        return True

    except Exception as e:
        # Best-effort boundary: callers fall back to an existing file, so log
        # the full traceback and signal failure instead of propagating.
        logger.error(f"[GDRIVE_SINGLE_FILE] An error occurred during download: {e}", exc_info=True)
        return False
|
| 99 |
|
| 100 |
def download_and_unzip_gdrive_file(file_id_or_url: str, target_extraction_dir: str) -> bool:
|
| 101 |
"""
|