Spaces:
Paused
Paused
Upload 2 files
Browse files- docker-compose.yaml +175 -9
- dockerfile +115 -20
docker-compose.yaml
CHANGED
@@ -1,19 +1,185 @@
|
|
1 |
-
|
2 |
-
version: "3.8"
|
3 |
|
4 |
services:
|
5 |
-
|
|
|
6 |
build:
|
7 |
context: .
|
8 |
dockerfile: Dockerfile
|
9 |
-
container_name:
|
10 |
restart: unless-stopped
|
11 |
ports:
|
12 |
- "7860:7860"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
networks:
|
14 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
volumes:
|
16 |
-
-
|
17 |
-
-
|
18 |
-
|
19 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
version: '3.8'
|
|
|
2 |
|
3 |
services:
|
4 |
+
# Persian Legal Scraper Main Application
|
5 |
+
persian-legal-scraper:
|
6 |
build:
|
7 |
context: .
|
8 |
dockerfile: Dockerfile
|
9 |
+
container_name: persian-legal-scraper-app
|
10 |
restart: unless-stopped
|
11 |
ports:
|
12 |
- "7860:7860"
|
13 |
+
environment:
|
14 |
+
# Application Settings
|
15 |
+
- PYTHONPATH=/app
|
16 |
+
- PYTHONUNBUFFERED=1
|
17 |
+
- LOG_LEVEL=INFO
|
18 |
+
- ENVIRONMENT=production
|
19 |
+
|
20 |
+
# Gradio Configuration
|
21 |
+
- GRADIO_SERVER_NAME=0.0.0.0
|
22 |
+
- GRADIO_SERVER_PORT=7860
|
23 |
+
- GRADIO_SHARE=false
|
24 |
+
- GRADIO_THEME=default
|
25 |
+
|
26 |
+
# AI/ML Model Settings
|
27 |
+
- HF_HOME=/app/cache
|
28 |
+
- TRANSFORMERS_CACHE=/app/cache/transformers
|
29 |
+
- TORCH_HOME=/app/cache/torch
|
30 |
+
- TOKENIZERS_PARALLELISM=false
|
31 |
+
|
32 |
+
# Performance Optimization
|
33 |
+
- PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128
|
34 |
+
- OMP_NUM_THREADS=2
|
35 |
+
- MKL_NUM_THREADS=2
|
36 |
+
|
37 |
+
# Database Configuration
|
38 |
+
- DATABASE_PATH=/app/data/iranian_legal_archive_advanced.sqlite
|
39 |
+
- CACHE_DB_PATH=/app/data/cache_system.sqlite
|
40 |
+
- EMBEDDINGS_CACHE_PATH=/app/data/embeddings_cache.pkl
|
41 |
+
- VECTOR_INDEX_PATH=/app/data/faiss_index.bin
|
42 |
+
|
43 |
+
# Security Settings
|
44 |
+
- ANTI_DDOS_ENABLED=true
|
45 |
+
- MAX_REQUESTS_PER_HOUR=100
|
46 |
+
- REQUEST_DELAY_MIN=1
|
47 |
+
- REQUEST_DELAY_MAX=5
|
48 |
+
|
49 |
+
# Legal Sources Configuration
|
50 |
+
- CRAWLER_ENABLED=true
|
51 |
+
- MAX_CRAWL_DEPTH=2
|
52 |
+
- CRAWL_DELAY_SECONDS=3
|
53 |
+
|
54 |
+
volumes:
|
55 |
+
# Persistent data storage
|
56 |
+
- persian_legal_data:/app/data
|
57 |
+
- persian_legal_cache:/app/cache
|
58 |
+
- persian_legal_logs:/app/logs
|
59 |
+
- persian_legal_uploads:/app/uploads
|
60 |
+
|
61 |
+
# Configuration files (optional)
|
62 |
+
- ./config:/app/config:ro
|
63 |
+
|
64 |
+
networks:
|
65 |
+
- persian-legal-network
|
66 |
+
|
67 |
+
healthcheck:
|
68 |
+
test: ["CMD", "curl", "-f", "http://localhost:7860"]
|
69 |
+
interval: 30s
|
70 |
+
timeout: 10s
|
71 |
+
retries: 3
|
72 |
+
start_period: 60s
|
73 |
+
|
74 |
+
depends_on:
|
75 |
+
- redis-cache
|
76 |
+
|
77 |
+
# Resource limits for production
|
78 |
+
deploy:
|
79 |
+
resources:
|
80 |
+
limits:
|
81 |
+
memory: 4G
|
82 |
+
cpus: '2.0'
|
83 |
+
reservations:
|
84 |
+
memory: 2G
|
85 |
+
cpus: '1.0'
|
86 |
+
|
87 |
+
# Redis for Advanced Caching (Optional)
|
88 |
+
redis-cache:
|
89 |
+
image: redis:7-alpine
|
90 |
+
container_name: persian-legal-redis
|
91 |
+
restart: unless-stopped
|
92 |
+
command: redis-server --appendonly yes --maxmemory 256mb --maxmemory-policy allkeys-lru
|
93 |
+
volumes:
|
94 |
+
- persian_legal_redis_data:/data
|
95 |
+
networks:
|
96 |
+
- persian-legal-network
|
97 |
+
ports:
  # Publish Redis on the host loopback interface only. The redis-server
  # command in this service sets no password, so binding to every host
  # interface would expose an unauthenticated cache. Containers attached to
  # persian-legal-network do not need a published port to reach it.
  - "127.0.0.1:6379:6379"
|
99 |
+
healthcheck:
|
100 |
+
test: ["CMD", "redis-cli", "ping"]
|
101 |
+
interval: 10s
|
102 |
+
timeout: 3s
|
103 |
+
retries: 3
|
104 |
+
|
105 |
+
# Nginx Reverse Proxy (Optional for production)
|
106 |
+
nginx-proxy:
|
107 |
+
image: nginx:alpine
|
108 |
+
container_name: persian-legal-nginx
|
109 |
+
restart: unless-stopped
|
110 |
+
ports:
|
111 |
+
- "80:80"
|
112 |
+
- "443:443"
|
113 |
+
volumes:
|
114 |
+
- ./nginx/nginx.conf:/etc/nginx/nginx.conf:ro
|
115 |
+
- ./nginx/ssl:/etc/nginx/ssl:ro
|
116 |
+
- persian_legal_logs:/var/log/nginx
|
117 |
networks:
|
118 |
+
- persian-legal-network
|
119 |
+
depends_on:
|
120 |
+
- persian-legal-scraper
|
121 |
+
profiles:
|
122 |
+
- production
|
123 |
+
|
124 |
+
# Database Backup Service (Optional)
|
125 |
+
db-backup:
|
126 |
+
image: alpine:latest
|
127 |
+
container_name: persian-legal-backup
|
128 |
+
restart: unless-stopped
|
129 |
+
command: |
|
130 |
+
sh -c '
|
131 |
+
apk add --no-cache sqlite curl
|
132 |
+
while true; do
|
133 |
+
echo "Starting backup at $$(date)"
|
134 |
+
sqlite3 /app/data/iranian_legal_archive_advanced.sqlite ".backup /app/backups/backup_$$(date +%Y%m%d_%H%M%S).sqlite"
|
135 |
+
find /app/backups -name "backup_*.sqlite" -mtime +7 -delete
|
136 |
+
echo "Backup completed at $$(date)"
|
137 |
+
sleep 86400
|
138 |
+
done
|
139 |
+
'
|
140 |
volumes:
|
141 |
+
- persian_legal_data:/app/data:ro
|
142 |
+
- persian_legal_backups:/app/backups
|
143 |
+
networks:
|
144 |
+
- persian-legal-network
|
145 |
+
profiles:
|
146 |
+
- production
|
147 |
+
|
148 |
+
volumes:
|
149 |
+
# Persistent data volumes
|
150 |
+
persian_legal_data:
|
151 |
+
driver: local
|
152 |
+
driver_opts:
|
153 |
+
type: none
|
154 |
+
o: bind
|
155 |
+
# Absolute host path for the bind-backed volume: relative `device` values are
# not resolved by every Compose release. NOTE(review): run Compose from the
# project directory and make sure ./data exists before `up`.
device: ${PWD}/data
|
156 |
+
|
157 |
+
persian_legal_cache:
|
158 |
+
driver: local
|
159 |
+
driver_opts:
|
160 |
+
type: none
|
161 |
+
o: bind
|
162 |
+
# Absolute host path for the bind-backed volume: relative `device` values are
# not resolved by every Compose release. NOTE(review): run Compose from the
# project directory and make sure ./cache exists before `up`.
device: ${PWD}/cache
|
163 |
+
|
164 |
+
persian_legal_logs:
|
165 |
+
driver: local
|
166 |
+
driver_opts:
|
167 |
+
type: none
|
168 |
+
o: bind
|
169 |
+
# Absolute host path for the bind-backed volume: relative `device` values are
# not resolved by every Compose release. NOTE(review): run Compose from the
# project directory and make sure ./logs exists before `up`.
device: ${PWD}/logs
|
170 |
+
|
171 |
+
persian_legal_uploads:
|
172 |
+
driver: local
|
173 |
+
|
174 |
+
persian_legal_redis_data:
|
175 |
+
driver: local
|
176 |
+
|
177 |
+
persian_legal_backups:
|
178 |
+
driver: local
|
179 |
+
|
180 |
+
networks:
|
181 |
+
persian-legal-network:
|
182 |
+
driver: bridge
|
183 |
+
ipam:
|
184 |
+
config:
|
185 |
+
- subnet: 172.20.0.0/16
|
dockerfile
CHANGED
@@ -1,34 +1,129 @@
|
|
1 |
-
# Dockerfile
|
2 |
-
|
3 |
|
4 |
-
|
|
|
5 |
|
6 |
-
#
|
7 |
RUN apt-get update && apt-get install -y \
|
8 |
-
|
|
|
|
|
|
|
|
|
|
|
9 |
curl \
|
10 |
-
&& rm -rf /var/lib/apt/lists/*
|
|
|
11 |
|
12 |
-
#
|
13 |
-
|
|
|
|
|
|
|
|
|
14 |
|
15 |
-
#
|
|
|
|
|
16 |
RUN pip install --no-cache-dir -r requirements.txt
|
17 |
|
18 |
-
#
|
19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
|
21 |
-
#
|
22 |
-
|
23 |
|
24 |
-
#
|
25 |
-
ENV
|
26 |
-
ENV
|
27 |
-
ENV
|
|
|
|
|
28 |
ENV TOKENIZERS_PARALLELISM=false
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
|
30 |
-
#
|
31 |
EXPOSE 7860
|
32 |
|
33 |
-
#
|
34 |
-
CMD ["python", "persian_legal_scraper.py"]
|
|
|
1 |
+
# Dockerfile for Advanced Iranian Legal Archive System
|
2 |
+
# Optimized for Hugging Face Spaces deployment
|
3 |
|
4 |
+
# Stage 1: Builder
|
5 |
+
FROM python:3.10-slim AS builder
|
6 |
|
7 |
+
# Install build dependencies and system packages
|
8 |
RUN apt-get update && apt-get install -y \
|
9 |
+
build-essential \
|
10 |
+
gcc \
|
11 |
+
g++ \
|
12 |
+
libffi-dev \
|
13 |
+
libssl-dev \
|
14 |
+
wget \
|
15 |
curl \
|
16 |
+
&& rm -rf /var/lib/apt/lists/* \
|
17 |
+
&& apt-get clean
|
18 |
|
19 |
+
# Upgrade pip and install build tools
|
20 |
+
# Create the virtual environment first and put it on PATH, so that the pip /
# setuptools / wheel upgrade below lands inside /opt/venv rather than in the
# system interpreter. /opt/venv is the tree later copied into the production
# stage, so this is the environment whose tooling actually matters.
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
RUN pip install --no-cache-dir --upgrade pip setuptools wheel
|
25 |
|
26 |
+
# Copy requirements and install Python dependencies
|
27 |
+
WORKDIR /build
|
28 |
+
COPY requirements.txt .
|
29 |
RUN pip install --no-cache-dir -r requirements.txt
|
30 |
|
31 |
+
# Create cache directory for models
|
32 |
+
RUN mkdir -p /app/cache/transformers /app/cache/sentence-transformers
|
33 |
+
|
34 |
+
# Pre-download Persian BERT models (Primary classification model)
|
35 |
+
RUN python -c "from transformers import AutoModel, AutoTokenizer; \
|
36 |
+
print('Downloading ParsBERT...'); \
|
37 |
+
AutoModel.from_pretrained('HooshvareLab/bert-base-parsbert-uncased', cache_dir='/app/cache/transformers'); \
|
38 |
+
AutoTokenizer.from_pretrained('HooshvareLab/bert-base-parsbert-uncased', cache_dir='/app/cache/transformers')" || true
|
39 |
+
|
40 |
+
# Pre-download NER model for entity recognition
|
41 |
+
RUN python -c "from transformers import AutoModel, AutoTokenizer; \
|
42 |
+
print('Downloading Persian NER model...'); \
|
43 |
+
AutoModel.from_pretrained('HooshvareLab/bert-fa-base-uncased-ner', cache_dir='/app/cache/transformers'); \
|
44 |
+
AutoTokenizer.from_pretrained('HooshvareLab/bert-fa-base-uncased-ner', cache_dir='/app/cache/transformers')" || true
|
45 |
+
|
46 |
+
# Pre-download Persian embedding model for semantic search
|
47 |
+
RUN python -c "from sentence_transformers import SentenceTransformer; \
|
48 |
+
print('Downloading Persian embedding model...'); \
|
49 |
+
model = SentenceTransformer('xmanii/maux-gte-persian'); \
|
50 |
+
model.save('/app/cache/sentence-transformers/maux-gte-persian')" || true
|
51 |
+
|
52 |
+
# Pre-download multilingual sentence transformer as fallback
|
53 |
+
RUN python -c "from sentence_transformers import SentenceTransformer; \
|
54 |
+
print('Downloading multilingual model...'); \
|
55 |
+
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'); \
|
56 |
+
model.save('/app/cache/sentence-transformers/paraphrase-multilingual')" || true
|
57 |
+
|
58 |
+
# Try to download FaBERT (latest SOTA Persian model)
|
59 |
+
RUN python -c "from transformers import AutoModel, AutoTokenizer; \
|
60 |
+
print('Downloading FaBERT...'); \
|
61 |
+
AutoModel.from_pretrained('sbunlp/fabert', cache_dir='/app/cache/transformers'); \
|
62 |
+
AutoTokenizer.from_pretrained('sbunlp/fabert', cache_dir='/app/cache/transformers')" || echo "FaBERT download failed, will fallback to ParsBERT"
|
63 |
+
|
64 |
+
# Stage 2: Production
|
65 |
+
FROM python:3.10-slim
|
66 |
+
|
67 |
+
# Install runtime dependencies
|
68 |
+
RUN apt-get update && apt-get install -y \
|
69 |
+
sqlite3 \
|
70 |
+
libsqlite3-dev \
|
71 |
+
curl \
|
72 |
+
&& rm -rf /var/lib/apt/lists/* \
|
73 |
+
&& apt-get clean
|
74 |
+
|
75 |
+
# Create non-root user for security
|
76 |
+
RUN groupadd -g 1000 appuser && useradd -r -u 1000 -g appuser appuser
|
77 |
+
|
78 |
+
# Copy virtual environment from builder stage
|
79 |
+
COPY --from=builder /opt/venv /opt/venv
|
80 |
+
ENV PATH="/opt/venv/bin:$PATH"
|
81 |
+
|
82 |
+
# Copy pre-downloaded models
|
83 |
+
COPY --from=builder /app/cache /app/cache
|
84 |
+
|
85 |
+
# Create application directories
|
86 |
+
RUN mkdir -p /app/data /app/logs /app/uploads /app/tmp && \
|
87 |
+
chown -R appuser:appuser /app
|
88 |
+
|
89 |
+
# Set working directory
|
90 |
+
WORKDIR /app
|
91 |
|
92 |
+
# Copy application files
|
93 |
+
COPY --chown=appuser:appuser . .
|
94 |
|
95 |
+
# Environment variables for Iranian Legal Archive System
|
96 |
+
ENV PYTHONPATH=/app
|
97 |
+
ENV PYTHONUNBUFFERED=1
|
98 |
+
ENV HF_HOME=/app/cache
|
99 |
+
ENV TRANSFORMERS_CACHE=/app/cache/transformers
|
100 |
+
ENV TORCH_HOME=/app/cache/torch
|
101 |
ENV TOKENIZERS_PARALLELISM=false
|
102 |
+
ENV LOG_LEVEL=INFO
|
103 |
+
ENV ENVIRONMENT=production
|
104 |
+
|
105 |
+
# Gradio specific settings
|
106 |
+
ENV GRADIO_SERVER_NAME=0.0.0.0
|
107 |
+
ENV GRADIO_SERVER_PORT=7860
|
108 |
+
ENV GRADIO_SHARE=false
|
109 |
+
|
110 |
+
# Memory optimization for HF Spaces
|
111 |
+
ENV PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128
|
112 |
+
ENV OMP_NUM_THREADS=2
|
113 |
+
ENV MKL_NUM_THREADS=2
|
114 |
+
|
115 |
+
# Switch to non-root user
|
116 |
+
USER appuser
|
117 |
+
|
118 |
+
# Create data directories with proper permissions
|
119 |
+
# Only the parent directory is created here. The *.sqlite, *.pkl and *.bin
# names under data/ are the file paths configured through DATABASE_PATH,
# CACHE_DB_PATH, EMBEDDINGS_CACHE_PATH and VECTOR_INDEX_PATH; pre-creating
# them with `mkdir -p` would turn each one into a directory and prevent the
# application from opening or writing the corresponding file.
RUN mkdir -p data
|
120 |
+
|
121 |
+
# Health check
|
122 |
+
HEALTHCHECK --interval=30s --timeout=30s --start-period=60s --retries=3 \
|
123 |
+
CMD curl -f http://localhost:7860 || exit 1
|
124 |
|
125 |
+
# Expose Gradio port
|
126 |
EXPOSE 7860
|
127 |
|
128 |
+
# Command to run the application (matching README.md app_file)
|
129 |
+
CMD ["python", "persian_legal_scraper.py"]
|