# Dockerfile for Advanced Iranian Legal Archive System
# Optimized for Hugging Face Spaces deployment

# Stage 1: Builder
FROM python:3.10-slim AS builder

# Install build dependencies and system packages
RUN apt-get update && apt-get install -y \
    build-essential \
    gcc \
    g++ \
    libffi-dev \
    libssl-dev \
    wget \
    curl \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean

# Create the virtual environment first, then upgrade its pip and build tools
# (upgrading pip before the venv exists would only touch the system interpreter)
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
RUN pip install --no-cache-dir --upgrade pip setuptools wheel

# Copy requirements and install Python dependencies
WORKDIR /build
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Create cache directories for models
RUN mkdir -p /app/cache/transformers /app/cache/sentence-transformers

# Pre-download Persian BERT model (primary classification model)
RUN python -c "from transformers import AutoModel, AutoTokenizer; \
    print('Downloading ParsBERT...'); \
    AutoModel.from_pretrained('HooshvareLab/bert-base-parsbert-uncased', cache_dir='/app/cache/transformers'); \
    AutoTokenizer.from_pretrained('HooshvareLab/bert-base-parsbert-uncased', cache_dir='/app/cache/transformers')" || true

# Pre-download NER model for entity recognition
RUN python -c "from transformers import AutoModel, AutoTokenizer; \
    print('Downloading Persian NER model...'); \
    AutoModel.from_pretrained('HooshvareLab/bert-fa-base-uncased-ner', cache_dir='/app/cache/transformers'); \
    AutoTokenizer.from_pretrained('HooshvareLab/bert-fa-base-uncased-ner', cache_dir='/app/cache/transformers')" || true

# Pre-download Persian embedding model for semantic search
RUN python -c "from sentence_transformers import SentenceTransformer; \
    print('Downloading Persian embedding model...'); \
    model = SentenceTransformer('xmanii/maux-gte-persian'); \
    model.save('/app/cache/sentence-transformers/maux-gte-persian')" || true

# Pre-download multilingual sentence transformer as a fallback
RUN python -c "from sentence_transformers import SentenceTransformer; \
    print('Downloading multilingual model...'); \
    model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'); \
    model.save('/app/cache/sentence-transformers/paraphrase-multilingual')" || true

# Try to download FaBERT (recent state-of-the-art Persian model)
RUN python -c "from transformers import AutoModel, AutoTokenizer; \
    print('Downloading FaBERT...'); \
    AutoModel.from_pretrained('sbunlp/fabert', cache_dir='/app/cache/transformers'); \
    AutoTokenizer.from_pretrained('sbunlp/fabert', cache_dir='/app/cache/transformers')" || echo "FaBERT download failed; will fall back to ParsBERT"

# Stage 2: Production
FROM python:3.10-slim

# Install runtime dependencies
RUN apt-get update && apt-get install -y \
    sqlite3 \
    libsqlite3-dev \
    curl \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean

# Create non-root user for security
RUN groupadd -g 1000 appuser && useradd -r -u 1000 -g appuser appuser

# Copy virtual environment from the builder stage
COPY --from=builder /opt/venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"

# Copy pre-downloaded models
COPY --from=builder /app/cache /app/cache

# Create application directories
RUN mkdir -p /app/data /app/logs /app/uploads /app/tmp && \
    chown -R appuser:appuser /app

# Set working directory
WORKDIR /app

# Copy application files
COPY --chown=appuser:appuser . .
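
# Optional build-time sanity check (a minimal sketch, not part of the original
# setup): the pre-download steps above are allowed to fail with "|| true", so
# this surfaces an empty model cache at build time instead of at first request.
RUN for d in /app/cache/transformers /app/cache/sentence-transformers; do \
        [ -n "$(ls -A "$d" 2>/dev/null)" ] || echo "WARNING: $d is empty; models will be fetched at runtime"; \
    done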
# Environment variables for Iranian Legal Archive System
ENV PYTHONPATH=/app
ENV PYTHONUNBUFFERED=1
ENV HF_HOME=/app/cache
ENV TRANSFORMERS_CACHE=/app/cache/transformers
ENV TORCH_HOME=/app/cache/torch
ENV TOKENIZERS_PARALLELISM=false
ENV LOG_LEVEL=INFO
ENV ENVIRONMENT=production

# Gradio-specific settings
ENV GRADIO_SERVER_NAME=0.0.0.0
ENV GRADIO_SERVER_PORT=7860
ENV GRADIO_SHARE=false

# Memory optimization for HF Spaces
ENV PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128
ENV OMP_NUM_THREADS=2
ENV MKL_NUM_THREADS=2

# Switch to non-root user
USER appuser

# Create the data directory; the SQLite databases, embeddings cache, and FAISS
# index (cache_system.sqlite, iranian_legal_archive_advanced.sqlite,
# embeddings_cache.pkl, faiss_index.bin) are files the app creates at runtime,
# so only the parent directory is created here, not directories with those names.
RUN mkdir -p data

# Health check
HEALTHCHECK --interval=30s --timeout=30s --start-period=60s --retries=3 \
    CMD curl -f http://localhost:7860 || exit 1

# Expose Gradio port
EXPOSE 7860

# Command to run the application (matching README.md app_file)
CMD ["python", "persian_legal_scraper.py"]
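
# Example local build/run (a sketch; the image tag and bind mount are
# assumptions, not part of the HF Spaces deployment, which builds automatically):
#   docker build -t iranian-legal-archive .
#   docker run --rm -p 7860:7860 -v "$(pwd)/data:/app/data" iranian-legal-archive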