# Dockerfile for Advanced Iranian Legal Archive System
# Optimized for Hugging Face Spaces deployment

# Stage 1: Builder
FROM python:3.10-slim AS builder

# Install build dependencies and system packages
RUN apt-get update && apt-get install -y \
    build-essential \
    gcc \
    g++ \
    libffi-dev \
    libssl-dev \
    wget \
    curl \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Upgrade pip and install build tools
RUN pip install --no-cache-dir --upgrade pip setuptools wheel

# Create virtual environment
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
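# Prepending the venv's bin directory to PATH makes its python and pip the
# defaults for every later RUN instruction, so no explicit activation is needed.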

# Copy requirements and install Python dependencies
WORKDIR /build
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
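# Copying requirements.txt on its own first lets Docker cache this dependency
# layer: pip install re-runs only when requirements.txt changes, not on every
# source edit.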

# Create cache directories for models
RUN mkdir -p /app/cache/transformers /app/cache/sentence-transformers

# Pre-download Persian BERT model (primary classification model);
# "|| true" keeps the build going if a download fails, deferring the fetch to runtime
RUN python -c "from transformers import AutoModel, AutoTokenizer; \
    print('Downloading ParsBERT...'); \
    AutoModel.from_pretrained('HooshvareLab/bert-base-parsbert-uncased', cache_dir='/app/cache/transformers'); \
    AutoTokenizer.from_pretrained('HooshvareLab/bert-base-parsbert-uncased', cache_dir='/app/cache/transformers')" || true

# Pre-download NER model for entity recognition
RUN python -c "from transformers import AutoModel, AutoTokenizer; \
    print('Downloading Persian NER model...'); \
    AutoModel.from_pretrained('HooshvareLab/bert-fa-base-uncased-ner', cache_dir='/app/cache/transformers'); \
    AutoTokenizer.from_pretrained('HooshvareLab/bert-fa-base-uncased-ner', cache_dir='/app/cache/transformers')" || true

# Pre-download Persian embedding model for semantic search
RUN python -c "from sentence_transformers import SentenceTransformer; \
    print('Downloading Persian embedding model...'); \
    model = SentenceTransformer('xmanii/maux-gte-persian'); \
    model.save('/app/cache/sentence-transformers/maux-gte-persian')" || true

# Pre-download multilingual sentence transformer as a fallback
RUN python -c "from sentence_transformers import SentenceTransformer; \
    print('Downloading multilingual model...'); \
    model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'); \
    model.save('/app/cache/sentence-transformers/paraphrase-multilingual')" || true

# Try to download FaBERT (a more recent Persian BERT model)
RUN python -c "from transformers import AutoModel, AutoTokenizer; \
    print('Downloading FaBERT...'); \
    AutoModel.from_pretrained('sbunlp/fabert', cache_dir='/app/cache/transformers'); \
    AutoTokenizer.from_pretrained('sbunlp/fabert', cache_dir='/app/cache/transformers')" || echo "FaBERT download failed, will fall back to ParsBERT"
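
# Optional sanity check (non-fatal, safe to remove): print cache sizes so the
# build log shows which of the "|| true"-guarded downloads actually succeeded
RUN du -sh /app/cache/transformers /app/cache/sentence-transformers || true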

# Stage 2: Production
FROM python:3.10-slim

# Install runtime dependencies
RUN apt-get update && apt-get install -y \
    sqlite3 \
    libsqlite3-dev \
    curl \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Create non-root user for security
RUN groupadd -g 1000 appuser && useradd -r -u 1000 -g appuser appuser
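# UID 1000 follows the Hugging Face Docker Spaces recommendation for the app
# user, which helps avoid permission issues with persistent storage.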

# Copy virtual environment from builder stage
COPY --from=builder /opt/venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"

# Copy pre-downloaded models
COPY --from=builder /app/cache /app/cache

# Create application directories
RUN mkdir -p /app/data /app/logs /app/uploads /app/tmp && \
    chown -R appuser:appuser /app

# Set working directory
WORKDIR /app

# Copy application files
COPY --chown=appuser:appuser . .

# Environment variables for Iranian Legal Archive System
ENV PYTHONPATH=/app
ENV PYTHONUNBUFFERED=1
ENV HF_HOME=/app/cache
ENV TRANSFORMERS_CACHE=/app/cache/transformers
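# Recent transformers releases deprecate TRANSFORMERS_CACHE in favor of HF_HOME;
# both are set so either library version resolves to the baked-in cache.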
ENV TORCH_HOME=/app/cache/torch
ENV TOKENIZERS_PARALLELISM=false
ENV LOG_LEVEL=INFO
ENV ENVIRONMENT=production

# Gradio-specific settings
ENV GRADIO_SERVER_NAME=0.0.0.0
ENV GRADIO_SERVER_PORT=7860
ENV GRADIO_SHARE=false

# Memory optimization for HF Spaces
ENV PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128
ENV OMP_NUM_THREADS=2
ENV MKL_NUM_THREADS=2
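# Capping OMP/MKL at 2 threads assumes the 2-vCPU basic CPU Space tier; raise
# these if the Space runs on larger hardware. PYTORCH_CUDA_ALLOC_CONF is ignored
# on CPU-only hardware but harmless.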

# Switch to non-root user
USER appuser

# Ensure the data directory exists; the SQLite databases, embeddings cache, and
# FAISS index are created as files inside it at runtime
RUN mkdir -p data

# Health check
HEALTHCHECK --interval=30s --timeout=30s --start-period=60s --retries=3 \
    CMD curl -f http://localhost:7860 || exit 1
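# start-period=60s gives the app time to load the cached models before failed
# probes count toward the retry limit.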

# Expose Gradio port
EXPOSE 7860
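# Spaces route external traffic to the app_port declared in the README front
# matter (7860 by default); EXPOSE itself is informational for other Docker hosts.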

# Command to run the application (matching the app_file in README.md)
CMD ["python", "persian_legal_scraper.py"]