Really-amin committed on
Commit
546331e
·
verified ·
1 Parent(s): 080041c

Upload 2 files

Browse files
Files changed (2) hide show
  1. docker-compose.yaml +175 -9
  2. dockerfile +115 -20
docker-compose.yaml CHANGED
@@ -1,19 +1,185 @@
1
- ```yaml
2
- version: "3.8"
3
 
4
  services:
5
- gradio:
 
6
  build:
7
  context: .
8
  dockerfile: Dockerfile
9
- container_name: legal_dashboard_gradio
10
  restart: unless-stopped
11
  ports:
12
  - "7860:7860"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  networks:
14
- - app_network
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  volumes:
16
- - ./data:/app/data:rw
17
- - ./cache:/app/cache:rw
18
- - ./logs:/app/logs:rw
19
- -
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ version: '3.8'
 
2
 
3
  services:
4
+ # Persian Legal Scraper Main Application
5
+ persian-legal-scraper:
6
  build:
7
  context: .
8
  dockerfile: Dockerfile
9
+ container_name: persian-legal-scraper-app
10
  restart: unless-stopped
11
  ports:
12
  - "7860:7860"
13
+ environment:
14
+ # Application Settings
15
+ - PYTHONPATH=/app
16
+ - PYTHONUNBUFFERED=1
17
+ - LOG_LEVEL=INFO
18
+ - ENVIRONMENT=production
19
+
20
+ # Gradio Configuration
21
+ - GRADIO_SERVER_NAME=0.0.0.0
22
+ - GRADIO_SERVER_PORT=7860
23
+ - GRADIO_SHARE=false
24
+ - GRADIO_THEME=default
25
+
26
+ # AI/ML Model Settings
27
+ - HF_HOME=/app/cache
28
+ - TRANSFORMERS_CACHE=/app/cache/transformers
29
+ - TORCH_HOME=/app/cache/torch
30
+ - TOKENIZERS_PARALLELISM=false
31
+
32
+ # Performance Optimization
33
+ - PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128
34
+ - OMP_NUM_THREADS=2
35
+ - MKL_NUM_THREADS=2
36
+
37
+ # Database Configuration
38
+ - DATABASE_PATH=/app/data/iranian_legal_archive_advanced.sqlite
39
+ - CACHE_DB_PATH=/app/data/cache_system.sqlite
40
+ - EMBEDDINGS_CACHE_PATH=/app/data/embeddings_cache.pkl
41
+ - VECTOR_INDEX_PATH=/app/data/faiss_index.bin
42
+
43
+ # Security Settings
44
+ - ANTI_DDOS_ENABLED=true
45
+ - MAX_REQUESTS_PER_HOUR=100
46
+ - REQUEST_DELAY_MIN=1
47
+ - REQUEST_DELAY_MAX=5
48
+
49
+ # Legal Sources Configuration
50
+ - CRAWLER_ENABLED=true
51
+ - MAX_CRAWL_DEPTH=2
52
+ - CRAWL_DELAY_SECONDS=3
53
+
54
+ volumes:
55
+ # Persistent data storage
56
+ - persian_legal_data:/app/data
57
+ - persian_legal_cache:/app/cache
58
+ - persian_legal_logs:/app/logs
59
+ - persian_legal_uploads:/app/uploads
60
+
61
+ # Configuration files (optional)
62
+ - ./config:/app/config:ro
63
+
64
+ networks:
65
+ - persian-legal-network
66
+
67
+ healthcheck:
68
+ test: ["CMD", "curl", "-f", "http://localhost:7860"]
69
+ interval: 30s
70
+ timeout: 10s
71
+ retries: 3
72
+ start_period: 60s
73
+
74
+ depends_on:
75
+ - redis-cache
76
+
77
+ # Resource limits for production
78
+ deploy:
79
+ resources:
80
+ limits:
81
+ memory: 4G
82
+ cpus: '2.0'
83
+ reservations:
84
+ memory: 2G
85
+ cpus: '1.0'
86
+
87
+ # Redis for Advanced Caching (Optional)
88
+ redis-cache:
89
+ image: redis:7-alpine
90
+ container_name: persian-legal-redis
91
+ restart: unless-stopped
92
+ command: redis-server --appendonly yes --maxmemory 256mb --maxmemory-policy allkeys-lru
93
+ volumes:
94
+ - persian_legal_redis_data:/data
95
+ networks:
96
+ - persian-legal-network
97
+ ports:
98
+ - "6379:6379"
99
+ healthcheck:
100
+ test: ["CMD", "redis-cli", "ping"]
101
+ interval: 10s
102
+ timeout: 3s
103
+ retries: 3
104
+
105
+ # Nginx Reverse Proxy (Optional for production)
106
+ nginx-proxy:
107
+ image: nginx:alpine
108
+ container_name: persian-legal-nginx
109
+ restart: unless-stopped
110
+ ports:
111
+ - "80:80"
112
+ - "443:443"
113
+ volumes:
114
+ - ./nginx/nginx.conf:/etc/nginx/nginx.conf:ro
115
+ - ./nginx/ssl:/etc/nginx/ssl:ro
116
+ - persian_legal_logs:/var/log/nginx
117
  networks:
118
+ - persian-legal-network
119
+ depends_on:
120
+ - persian-legal-scraper
121
+ profiles:
122
+ - production
123
+
124
+ # Database Backup Service (Optional)
125
+ db-backup:
126
+ image: alpine:latest
127
+ container_name: persian-legal-backup
128
+ restart: unless-stopped
129
+ command: |
130
+ sh -c '
131
+ apk add --no-cache sqlite curl
132
+ while true; do
133
+ echo "Starting backup at $$(date)"
134
+ sqlite3 /app/data/iranian_legal_archive_advanced.sqlite ".backup /app/backups/backup_$$(date +%Y%m%d_%H%M%S).sqlite"
135
+ find /app/backups -name "backup_*.sqlite" -mtime +7 -delete
136
+ echo "Backup completed at $$(date)"
137
+ sleep 86400
138
+ done
139
+ '
140
  volumes:
141
+ - persian_legal_data:/app/data:ro
142
+ - persian_legal_backups:/app/backups
143
+ networks:
144
+ - persian-legal-network
145
+ profiles:
146
+ - production
147
+
148
+ volumes:
149
+ # Persistent data volumes
150
+ persian_legal_data:
151
+ driver: local
152
+ driver_opts:
153
+ type: none
154
+ o: bind
155
+ device: ./data
156
+
157
+ persian_legal_cache:
158
+ driver: local
159
+ driver_opts:
160
+ type: none
161
+ o: bind
162
+ device: ./cache
163
+
164
+ persian_legal_logs:
165
+ driver: local
166
+ driver_opts:
167
+ type: none
168
+ o: bind
169
+ device: ./logs
170
+
171
+ persian_legal_uploads:
172
+ driver: local
173
+
174
+ persian_legal_redis_data:
175
+ driver: local
176
+
177
+ persian_legal_backups:
178
+ driver: local
179
+
180
+ networks:
181
+ persian-legal-network:
182
+ driver: bridge
183
+ ipam:
184
+ config:
185
+ - subnet: 172.20.0.0/16
dockerfile CHANGED
@@ -1,34 +1,129 @@
1
- # Dockerfile
2
- FROM python:3.9-slim
3
 
4
- WORKDIR /app
 
5
 
6
- # نصب dependencies سیستمی
7
  RUN apt-get update && apt-get install -y \
8
- git \
 
 
 
 
 
9
  curl \
10
- && rm -rf /var/lib/apt/lists/*
 
11
 
12
- # کپی فایل requirements
13
- COPY requirements.txt .
 
 
 
 
14
 
15
- # نصب پکیج های پایتون
 
 
16
  RUN pip install --no-cache-dir -r requirements.txt
17
 
18
- # کپی فایل اصلی
19
- COPY persian_legal_scraper.py .
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
- # ایجاد دایرکتوری های کش
22
- RUN mkdir -p /tmp/hf_cache /tmp/torch_cache
23
 
24
- # تنظیم محیط
25
- ENV TRANSFORMERS_CACHE=/tmp/hf_cache
26
- ENV HF_HOME=/tmp/hf_cache
27
- ENV TORCH_HOME=/tmp/torch_cache
 
 
28
  ENV TOKENIZERS_PARALLELISM=false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
- # پورت اکسپوز
31
  EXPOSE 7860
32
 
33
- # اجرای برنامه
34
- CMD ["python", "persian_legal_scraper.py"]
 
1
+ # Dockerfile for Advanced Iranian Legal Archive System
2
+ # Optimized for Hugging Face Spaces deployment
3
 
4
+ # Stage 1: Builder
5
+ FROM python:3.10-slim AS builder
6
 
7
+ # Install build dependencies and system packages
8
  RUN apt-get update && apt-get install -y \
9
+ build-essential \
10
+ gcc \
11
+ g++ \
12
+ libffi-dev \
13
+ libssl-dev \
14
+ wget \
15
  curl \
16
+ && rm -rf /var/lib/apt/lists/* \
17
+ && apt-get clean
18
 
19
+ # Upgrade pip and install build tools
20
+ RUN pip install --no-cache-dir --upgrade pip setuptools wheel
21
+
22
+ # Create virtual environment
23
+ RUN python -m venv /opt/venv
24
+ ENV PATH="/opt/venv/bin:$PATH"
25
 
26
+ # Copy requirements and install Python dependencies
27
+ WORKDIR /build
28
+ COPY requirements.txt .
29
  RUN pip install --no-cache-dir -r requirements.txt
30
 
31
+ # Create cache directory for models
32
+ RUN mkdir -p /app/cache/transformers /app/cache/sentence-transformers
33
+
34
+ # Pre-download Persian BERT models (Primary classification model)
35
+ RUN python -c "from transformers import AutoModel, AutoTokenizer; \
36
+ print('Downloading ParsBERT...'); \
37
+ AutoModel.from_pretrained('HooshvareLab/bert-base-parsbert-uncased', cache_dir='/app/cache/transformers'); \
38
+ AutoTokenizer.from_pretrained('HooshvareLab/bert-base-parsbert-uncased', cache_dir='/app/cache/transformers')" || true
39
+
40
+ # Pre-download NER model for entity recognition
41
+ RUN python -c "from transformers import AutoModel, AutoTokenizer; \
42
+ print('Downloading Persian NER model...'); \
43
+ AutoModel.from_pretrained('HooshvareLab/bert-fa-base-uncased-ner', cache_dir='/app/cache/transformers'); \
44
+ AutoTokenizer.from_pretrained('HooshvareLab/bert-fa-base-uncased-ner', cache_dir='/app/cache/transformers')" || true
45
+
46
+ # Pre-download Persian embedding model for semantic search
47
+ RUN python -c "from sentence_transformers import SentenceTransformer; \
48
+ print('Downloading Persian embedding model...'); \
49
+ model = SentenceTransformer('xmanii/maux-gte-persian'); \
50
+ model.save('/app/cache/sentence-transformers/maux-gte-persian')" || true
51
+
52
+ # Pre-download multilingual sentence transformer as fallback
53
+ RUN python -c "from sentence_transformers import SentenceTransformer; \
54
+ print('Downloading multilingual model...'); \
55
+ model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'); \
56
+ model.save('/app/cache/sentence-transformers/paraphrase-multilingual')" || true
57
+
58
+ # Try to download FaBERT (latest SOTA Persian model)
59
+ RUN python -c "from transformers import AutoModel, AutoTokenizer; \
60
+ print('Downloading FaBERT...'); \
61
+ AutoModel.from_pretrained('sbunlp/fabert', cache_dir='/app/cache/transformers'); \
62
+ AutoTokenizer.from_pretrained('sbunlp/fabert', cache_dir='/app/cache/transformers')" || echo "FaBERT download failed, will fallback to ParsBERT"
63
+
64
+ # Stage 2: Production
65
+ FROM python:3.10-slim
66
+
67
+ # Install runtime dependencies
68
+ RUN apt-get update && apt-get install -y \
69
+ sqlite3 \
70
+ libsqlite3-dev \
71
+ curl \
72
+ && rm -rf /var/lib/apt/lists/* \
73
+ && apt-get clean
74
+
75
+ # Create non-root user for security
76
+ RUN groupadd -g 1000 appuser && useradd -r -u 1000 -g appuser appuser
77
+
78
+ # Copy virtual environment from builder stage
79
+ COPY --from=builder /opt/venv /opt/venv
80
+ ENV PATH="/opt/venv/bin:$PATH"
81
+
82
+ # Copy pre-downloaded models
83
+ COPY --from=builder /app/cache /app/cache
84
+
85
+ # Create application directories
86
+ RUN mkdir -p /app/data /app/logs /app/uploads /app/tmp && \
87
+ chown -R appuser:appuser /app
88
+
89
+ # Set working directory
90
+ WORKDIR /app
91
 
92
+ # Copy application files
93
+ COPY --chown=appuser:appuser . .
94
 
95
+ # Environment variables for Iranian Legal Archive System
96
+ ENV PYTHONPATH=/app
97
+ ENV PYTHONUNBUFFERED=1
98
+ ENV HF_HOME=/app/cache
99
+ ENV TRANSFORMERS_CACHE=/app/cache/transformers
100
+ ENV TORCH_HOME=/app/cache/torch
101
  ENV TOKENIZERS_PARALLELISM=false
102
+ ENV LOG_LEVEL=INFO
103
+ ENV ENVIRONMENT=production
104
+
105
+ # Gradio specific settings
106
+ ENV GRADIO_SERVER_NAME=0.0.0.0
107
+ ENV GRADIO_SERVER_PORT=7860
108
+ ENV GRADIO_SHARE=false
109
+
110
+ # Memory optimization for HF Spaces
111
+ ENV PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128
112
+ ENV OMP_NUM_THREADS=2
113
+ ENV MKL_NUM_THREADS=2
114
+
115
+ # Switch to non-root user
116
+ USER appuser
117
+
118
+ # Create data directories with proper permissions
119
+ RUN mkdir -p data/cache_system.sqlite data/iranian_legal_archive_advanced.sqlite data/embeddings_cache.pkl data/faiss_index.bin
120
+
121
+ # Health check
122
+ HEALTHCHECK --interval=30s --timeout=30s --start-period=60s --retries=3 \
123
+ CMD curl -f http://localhost:7860 || exit 1
124
 
125
+ # Expose Gradio port
126
  EXPOSE 7860
127
 
128
+ # Command to run the application (matching README.md app_file)
129
+ CMD ["python", "persian_legal_scraper.py"]