"""Startup helper: configure logging and embed local data when needed."""

import logging
import os
import sys
from pathlib import Path

# Shared record layout for every handler installed by setup_logging().
LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
# Log file written only outside HuggingFace (relative to the working directory).
LOG_FILE = "embed_data.log"
# Default location of the documents to embed (relative to the working directory).
DEFAULT_DATA_DIR = "data"
# A store holding fewer documents than this is treated as not yet embedded.
MIN_EMBEDDED_DOCUMENTS = 50


def _on_huggingface() -> bool:
    """Return True when the ``SPACE_ID`` environment variable is set and non-empty.

    This script uses ``SPACE_ID`` as its marker for running on HuggingFace.
    """
    return bool(os.getenv("SPACE_ID"))


def setup_logging() -> None:
    """Configure root logging for the current environment.

    On HuggingFace (``SPACE_ID`` set) records go to the console only.
    Elsewhere they are additionally written to ``embed_data.log`` (UTF-8).
    """
    handlers = [logging.StreamHandler()]
    if not _on_huggingface():
        # Local run: also keep a log file.
        handlers.append(logging.FileHandler(LOG_FILE, encoding='utf-8'))

    logging.basicConfig(
        level=logging.INFO,
        format=LOG_FORMAT,
        handlers=handlers,
    )


setup_logging()
logger = logging.getLogger(__name__)


def _check_basic_initialization() -> None:
    """Load the embedding model and log its document count.

    Best-effort smoke test: any failure is logged as an error and swallowed
    so that start-up can continue.
    """
    try:
        logger.info("Testing basic model initialization...")
        # Imported lazily so that a broken model stack is reported through
        # the log instead of failing at module import time.
        from core.embedding_model import get_embedding_model

        embedding_model = get_embedding_model()
        count = embedding_model.count()
        logger.info("ChromaDB initialized with %s documents", count)
        logger.info("Basic initialization successful")
    except Exception as e:
        logger.error("Basic initialization failed: %s", e)


def setup_data(data_dir=DEFAULT_DATA_DIR, min_documents=MIN_EMBEDDED_DOCUMENTS):
    """Set up and embed data on startup.

    On HuggingFace (``SPACE_ID`` set) automatic embedding is skipped and only
    a basic initialization check of the embedding model is performed.
    Locally, the data directory is embedded unless the store already reports
    at least ``min_documents`` documents.

    Args:
        data_dir: Directory containing the data to embed. Must be an existing
            directory; otherwise an error is logged and nothing is embedded.
        min_documents: Embedding runs only when the model's ``count()`` is
            below this value.

    Returns:
        None. Outcomes are reported through the module logger; exceptions
        are logged with their traceback and never propagated.
    """
    try:
        logger.info("Starting data setup process...")

        # Skip automatic embedding on HuggingFace.
        if _on_huggingface():
            logger.info("HuggingFace environment detected")
            logger.info("⏭Skipping auto-embedding due to PyTorch meta tensor issues")
            logger.info("Use /api/embed-data endpoint to manually embed data")
            # Only verify that the model can be initialized.
            _check_basic_initialization()
            return

        # Local environment - run the embedding as usual.
        logger.info("Local environment - proceeding with auto-embedding")

        # isdir() rather than exists(): a regular file that happens to carry
        # the directory's name must not be accepted as the data directory.
        if not os.path.isdir(data_dir):
            logger.error("Data directory %s not found!", data_dir)
            return

        # Lazy imports: failures land in the except clause below.
        from core.embedding_model import get_embedding_model

        embedding_model = get_embedding_model()
        current_count = embedding_model.count()

        if current_count < min_documents:
            logger.info("Starting embedding process...")
            from scripts.embed_data import embed_all_data

            result = embed_all_data(data_dir, force=False)
            logger.info("Embedding completed: %s", result)
        else:
            logger.info("⏭Data already embedded")

    except Exception as e:
        # Top-level boundary: logger.exception records the message together
        # with the active traceback in a single log record.
        logger.exception("Error in setup_data: %s", e)


if __name__ == "__main__":
    setup_data()