# BackEnd / startup.py
# HaRin2806
# fix bug
# 7052598
import os
import sys
import logging
from pathlib import Path
# Configure logging: console-only when SPACE_ID is set (HuggingFace), console + file otherwise
def setup_logging():
    """Configure root logging for the current runtime environment.

    When the ``SPACE_ID`` environment variable is set (used by this file as
    the marker of a HuggingFace deployment) records go to the console only.
    Otherwise they are additionally written to ``embed_data.log`` (UTF-8),
    a path relative to the current working directory.
    """
    log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'

    # The console handler is common to both environments.
    sinks = [logging.StreamHandler()]
    if not os.getenv("SPACE_ID"):
        # Local run: also persist the log to a file.
        sinks.append(logging.FileHandler("embed_data.log", encoding='utf-8'))

    logging.basicConfig(level=logging.INFO, format=log_format, handlers=sinks)
# Configure logging at import time, before the module-level logger is used below.
setup_logging()
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
def _smoke_test_embedding_model():
    """Initialize the embedding model and log its document count (best effort).

    Any exception is logged and swallowed on purpose: this is only a sanity
    check and must not abort startup.
    """
    try:
        logger.info("Testing basic model initialization...")
        # Imported lazily so that merely importing this module does not load the model.
        from core.embedding_model import get_embedding_model

        embedding_model = get_embedding_model()
        count = embedding_model.count()
        logger.info(f"ChromaDB initialized with {count} documents")
        logger.info("Basic initialization successful")
    except Exception as e:
        logger.error(f"Basic initialization failed: {e}")


def setup_data(data_dir="data", min_documents=50):
    """Set up and embed data on startup.

    When the ``SPACE_ID`` environment variable is set (HuggingFace), automatic
    embedding is skipped and only a basic model initialization check runs.
    Otherwise the data under ``data_dir`` is embedded, unless the embedding
    store already reports at least ``min_documents`` entries.

    Args:
        data_dir: Directory holding the source data. Must be an existing
            directory; a regular file with that name is rejected.
        min_documents: Count at or above which the data is considered already
            embedded and the embedding step is skipped.

    Returns:
        None. Every failure is logged (with traceback) rather than raised, so
        a caller's startup sequence is never interrupted by this function.
    """
    try:
        logger.info("Starting data setup process...")

        # Skip auto-embedding on HuggingFace.
        if os.getenv("SPACE_ID"):
            logger.info("HuggingFace environment detected")
            logger.info("⏭Skipping auto-embedding due to PyTorch meta tensor issues")
            logger.info("Use /api/embed-data endpoint to manually embed data")
            # Only verify that the model can be initialized.
            _smoke_test_embedding_model()
            return

        # Local environment - run the embedding as usual.
        logger.info("Local environment - proceeding with auto-embedding")

        # isdir (not exists): a plain file named like the data directory is
        # not usable input and should be reported here, not fail later.
        if not os.path.isdir(data_dir):
            logger.error(f"Data directory {data_dir} not found!")
            return

        # Imported lazily so the HuggingFace path and the missing-directory
        # path never pay for (or fail on) the heavy imports.
        from core.embedding_model import get_embedding_model

        embedding_model = get_embedding_model()
        current_count = embedding_model.count()
        if current_count >= min_documents:
            logger.info("⏭Data already embedded")
            return

        logger.info("Starting embedding process...")
        from scripts.embed_data import embed_all_data

        result = embed_all_data(data_dir, force=False)
        logger.info(f"Embedding completed: {result}")
    except Exception as e:
        # Top-level boundary: log message and traceback in one record, never re-raise.
        logger.exception(f"Error in setup_data: {e}")
# Script entry point: run the data setup only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    setup_data()