# Dockerfile for Advanced Iranian Legal Archive System
# Optimized for Hugging Face Spaces deployment
# Stage 1: Builder
FROM python:3.10-slim AS builder
# Install build dependencies and system packages
RUN apt-get update && apt-get install -y \
    build-essential \
    gcc \
    g++ \
    libffi-dev \
    libssl-dev \
    wget \
    curl \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*
# Create virtual environment
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Upgrade pip and build tooling inside the virtual environment
# (doing this before the venv exists would only upgrade the system pip)
RUN pip install --no-cache-dir --upgrade pip setuptools wheel
# Copy requirements and install Python dependencies
WORKDIR /build
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
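# requirements.txt ships with the repository; as a rough sketch it is assumed
# to pin at least the packages exercised below (package names are assumptions,
# versions unknown):
#   torch, transformers, sentence-transformers, faiss-cpu, gradio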
# Create cache directory for models
RUN mkdir -p /app/cache/transformers /app/cache/sentence-transformers
# Pre-download ParsBERT (primary classification model)
RUN python -c "from transformers import AutoModel, AutoTokenizer; \
print('Downloading ParsBERT...'); \
AutoModel.from_pretrained('HooshvareLab/bert-base-parsbert-uncased', cache_dir='/app/cache/transformers'); \
AutoTokenizer.from_pretrained('HooshvareLab/bert-base-parsbert-uncased', cache_dir='/app/cache/transformers')" || true
# Pre-download NER model for entity recognition
RUN python -c "from transformers import AutoModel, AutoTokenizer; \
print('Downloading Persian NER model...'); \
AutoModel.from_pretrained('HooshvareLab/bert-fa-base-uncased-ner', cache_dir='/app/cache/transformers'); \
AutoTokenizer.from_pretrained('HooshvareLab/bert-fa-base-uncased-ner', cache_dir='/app/cache/transformers')" || true
# Pre-download Persian embedding model for semantic search
RUN python -c "from sentence_transformers import SentenceTransformer; \
print('Downloading Persian embedding model...'); \
model = SentenceTransformer('xmanii/maux-gte-persian'); \
model.save('/app/cache/sentence-transformers/maux-gte-persian')" || true
# Pre-download multilingual sentence transformer as fallback
RUN python -c "from sentence_transformers import SentenceTransformer; \
print('Downloading multilingual model...'); \
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'); \
model.save('/app/cache/sentence-transformers/paraphrase-multilingual')" || true
# Try to download FaBERT (a newer Persian BERT variant; ParsBERT remains the fallback)
RUN python -c "from transformers import AutoModel, AutoTokenizer; \
print('Downloading FaBERT...'); \
AutoModel.from_pretrained('sbunlp/fabert', cache_dir='/app/cache/transformers'); \
AutoTokenizer.from_pretrained('sbunlp/fabert', cache_dir='/app/cache/transformers')" || echo "FaBERT download failed, will fallback to ParsBERT"
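# Optional sanity check: the downloads above are non-fatal (|| true), so
# printing the cache sizes makes the build log show which models actually
# landed in the image
RUN du -sh /app/cache/transformers /app/cache/sentence-transformers || true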
# Stage 2: Production
FROM python:3.10-slim
# Install runtime dependencies
RUN apt-get update && apt-get install -y \
    sqlite3 \
    libsqlite3-dev \
    curl \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*
# Create non-root user for security
RUN groupadd -g 1000 appuser && useradd -r -u 1000 -g appuser appuser
# Copy virtual environment from builder stage
COPY --from=builder /opt/venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Copy pre-downloaded models
COPY --from=builder /app/cache /app/cache
# Create application directories
RUN mkdir -p /app/data /app/logs /app/uploads /app/tmp && \
    chown -R appuser:appuser /app
# Set working directory
WORKDIR /app
# Copy application files
COPY --chown=appuser:appuser . .
# Environment variables for Iranian Legal Archive System
ENV PYTHONPATH=/app
ENV PYTHONUNBUFFERED=1
ENV HF_HOME=/app/cache
ENV TRANSFORMERS_CACHE=/app/cache/transformers
ENV TORCH_HOME=/app/cache/torch
ENV TOKENIZERS_PARALLELISM=false
ENV LOG_LEVEL=INFO
ENV ENVIRONMENT=production
# Gradio specific settings
ENV GRADIO_SERVER_NAME=0.0.0.0
ENV GRADIO_SERVER_PORT=7860
ENV GRADIO_SHARE=false
# Memory optimization for HF Spaces
ENV PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128
ENV OMP_NUM_THREADS=2
ENV MKL_NUM_THREADS=2
# Switch to non-root user
USER appuser
# Ensure the runtime data directory exists; the application creates
# cache_system.sqlite, iranian_legal_archive_advanced.sqlite,
# embeddings_cache.pkl, and faiss_index.bin inside it at runtime.
# (Passing those file paths to mkdir -p would create directories with
# the same names and block SQLite/pickle from writing the actual files.)
RUN mkdir -p data
# Health check
HEALTHCHECK --interval=30s --timeout=30s --start-period=60s --retries=3 \
    CMD curl -f http://localhost:7860/ || exit 1
# Expose Gradio port
EXPOSE 7860
# Command to run the application (matching README.md app_file)
CMD ["python", "persian_legal_scraper.py"]