Really-amin committed
Commit 150e5be · verified · 1 parent: d2f2b8a

Upload 6 files

Files changed (6):
  1. app.py +10 -3
  2. docker-compose.yaml +2 -54
  3. dockerfile +82 -67
  4. enhanced_legal_scraper.py +19 -6
  5. main.py +16 -10
  6. requirements.txt +18 -16
app.py CHANGED
@@ -1,16 +1,22 @@
+```python
 import gradio as gr
 import logging
 import requests
+import os
 from datetime import datetime
 from typing import Dict, List, Optional, Tuple
 from enhanced_legal_scraper import EnhancedLegalScraper, LegalDocument, IRANIAN_LEGAL_SOURCES
 
+# Create log directory
+log_dir = '/app/logs'
+os.makedirs(log_dir, exist_ok=True)
+
 # Configure logging
 logging.basicConfig(
     level=logging.INFO,
     format='%(asctime)s - %(levelname)s - %(message)s',
     handlers=[
-        logging.FileHandler('/app/logs/legal_scraper.log'),
+        logging.FileHandler(os.path.join(log_dir, 'legal_scraper.log')),
         logging.StreamHandler()
     ]
 )
@@ -203,7 +209,7 @@ class LegalScraperInterface:
     def export_data(self, export_format: str) -> Tuple[str, Optional[gr.File]]:
         try:
             timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
-            filename = f"legal_documents_{timestamp}.{export_format.lower()}"
+            filename = f"/app/data/legal_documents_{timestamp}.{export_format.lower()}"
             if export_format == "CSV":
                 result = self.scraper.export_to_csv(filename)
                 if result:
@@ -584,4 +590,5 @@ if __name__ == "__main__":
         show_error=True,
         show_tips=True,
         enable_queue=True
     )
+```
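Reviewer note: the substantive fix in this file is creating the log directory before `logging.basicConfig` attaches a `FileHandler`; without it, `FileHandler` raises `FileNotFoundError` when `/app/logs` is absent and the app dies at import time. A minimal standalone sketch of that pattern (the `LOG_DIR` environment override is hypothetical, not part of this commit):

```python
import logging
import os

# LOG_DIR override is hypothetical; the commit hardcodes /app/logs.
log_dir = os.getenv("LOG_DIR", "/app/logs")
os.makedirs(log_dir, exist_ok=True)  # no-op if the directory already exists

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[
        logging.FileHandler(os.path.join(log_dir, "legal_scraper.log")),
        logging.StreamHandler(),  # keep console output alongside the file
    ],
)
```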
docker-compose.yaml CHANGED
@@ -1,3 +1,4 @@
+```yaml
 version: "3.8"
 
 services:
@@ -15,57 +16,4 @@ services:
       - ./data:/app/data:rw
       - ./cache:/app/cache:rw
       - ./logs:/app/logs:rw
-      - ./uploads:/app/uploads:rw
-    environment:
-      - DATABASE_DIR=/app/data
-      - DATABASE_PATH=/app/data/legal_scraper.db
-      - TRANSFORMERS_CACHE=/app/cache
-      - HF_HOME=/app/cache
-      - LOG_LEVEL=INFO
-      - ENVIRONMENT=production
-      - PYTHONPATH=/app
-      - PYTHONUNBUFFERED=1
-      - APP_MODE=gradio
-      - API_BASE_URL=http://fastapi:8000
-    depends_on:
-      fastapi:
-        condition: service_healthy
-
-  fastapi:
-    build:
-      context: .
-      dockerfile: Dockerfile
-    container_name: legal_dashboard_fastapi
-    restart: unless-stopped
-    ports:
-      - "8000:8000"
-    networks:
-      - app_network
-    volumes:
-      - ./data:/app/data:rw
-      - ./cache:/app/cache:rw
-      - ./logs:/app/logs:rw
-      - ./uploads:/app/uploads:rw
-    environment:
-      - DATABASE_DIR=/app/data
-      - DATABASE_PATH=/app/data/legal_scraper.db
-      - TRANSFORMERS_CACHE=/app/cache
-      - HF_HOME=/app/cache
-      - LOG_LEVEL=INFO
-      - ENVIRONMENT=production
-      - PYTHONPATH=/app
-      - PYTHONUNBUFFERED=1
-      - APP_MODE=fastapi
-    healthcheck:
-      test: ["CMD-SHELL", "curl -fs http://localhost:8000/health || exit 1"]
-      interval: 45s
-      timeout: 30s
-      retries: 10
-      start_period: 180s
-
-  redis:
-    image: redis:7-alpine
-    container_name: legal_dashboard_redis
-    restart: unless-stopped
-    networks:
-      - app_network
+      -
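Reviewer note: this removes the `fastapi` and `redis` services along with the curl-based healthcheck that `depends_on: condition: service_healthy` relied on. If a healthcheck is reintroduced, a Python probe avoids assuming curl exists in the slim image; a sketch of an equivalent check against the `/health` route that `main.py` exposes (the script name and wiring are suggestions, not part of this commit):

```python
# healthcheck.py - stdlib equivalent of: curl -fs http://localhost:8000/health
import sys
import urllib.request

try:
    with urllib.request.urlopen("http://localhost:8000/health", timeout=5) as resp:
        sys.exit(0 if resp.status == 200 else 1)  # exit 0 = healthy for Docker
except Exception:
    sys.exit(1)
```

In compose this could be wired as `test: ["CMD", "python", "/app/healthcheck.py"]`.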
dockerfile CHANGED
@@ -1,67 +1,82 @@
1
- ```
2
- # Stage 1: Builder
3
- FROM python:3.10-slim AS builder
4
-
5
- # Install build dependencies
6
- RUN apt-get update && apt-get install -y \
7
- build-essential \
8
- gcc \
9
- g++ \
10
- libffi-dev \
11
- libssl-dev \
12
- && rm -rf /var/lib/apt/lists/*
13
-
14
- # Upgrade pip and install wheel
15
- RUN pip install --upgrade pip setuptools wheel
16
-
17
- # Create virtual environment
18
- RUN python -m venv /opt/venv
19
- ENV PATH="/opt/venv/bin:$PATH"
20
-
21
- # Copy requirements and install dependencies
22
- WORKDIR /build
23
- COPY requirements.txt .
24
- RUN pip install --no-cache-dir -r requirements.txt
25
-
26
- # Stage 2: Production
27
- FROM python:3.10-slim
28
-
29
- # Install runtime dependencies
30
- RUN apt-get update && apt-get install -y \
31
- sqlite3 \
32
- && rm -rf /var/lib/apt/lists/* \
33
- && apt-get clean
34
-
35
- # Create non-root user
36
- RUN groupadd -g 1000 appuser && useradd -r -u 1000 -g appuser appuser
37
-
38
- # Copy virtual environment from builder
39
- COPY --from=builder /opt/venv /opt/venv
40
- ENV PATH="/opt/venv/bin:$PATH"
41
-
42
- # Set working directory
43
- WORKDIR /app
44
-
45
- # Copy all files
46
- COPY --chown=appuser:appuser . .
47
-
48
- # Environment variables
49
- ENV PYTHONPATH=/app
50
- ENV TRANSFORMERS_CACHE=/app/cache
51
- ENV HF_HOME=/app/cache
52
- ENV LOG_LEVEL=INFO
53
- ENV ENVIRONMENT=production
54
- ENV PYTHONUNBUFFERED=1
55
- ENV API_BASE_URL=http://localhost:8000
56
- ENV APP_MODE=gradio
57
-
58
- # Switch to non-root user
59
- USER appuser
60
-
61
- # Expose ports for Gradio and FastAPI
62
- EXPOSE 7860
63
- EXPOSE 8000
64
-
65
- # Start application via run.py
66
- CMD ["python", "run.py"]
67
- ```
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ```dockerfile
2
+ # Stage 1: Builder
3
+ FROM python:3.10-slim AS builder
4
+
5
+ # Install build dependencies
6
+ RUN apt-get update && apt-get install -y \
7
+ build-essential \
8
+ gcc \
9
+ g++ \
10
+ libffi-dev \
11
+ libssl-dev \
12
+ && rm -rf /var/lib/apt/lists/*
13
+
14
+ # Upgrade pip and install wheel
15
+ RUN pip install --upgrade pip setuptools wheel
16
+
17
+ # Create virtual environment
18
+ RUN python -m venv /opt/venv
19
+ ENV PATH="/opt/venv/bin:$PATH"
20
+
21
+ # Copy requirements and install dependencies
22
+ WORKDIR /build
23
+ COPY requirements.txt .
24
+ RUN pip install --no-cache-dir -r requirements.txt
25
+
26
+ # Pre-download transformer models
27
+ RUN mkdir -p /app/cache && \
28
+ python -c "from transformers import AutoModel, AutoTokenizer; \
29
+ AutoModel.from_pretrained('HooshvareLab/bert-fa-base-uncased', cache_dir='/app/cache'); \
30
+ AutoTokenizer.from_pretrained('HooshvareLab/bert-fa-base-uncased', cache_dir='/app/cache')" || true
31
+ RUN python -c "from transformers import TrOCRProcessor, VisionEncoderDecoderModel; \
32
+ TrOCRProcessor.from_pretrained('microsoft/trocr-base-printed', cache_dir='/app/cache'); \
33
+ VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-printed', cache_dir='/app/cache')" || true
34
+
35
+ # Stage 2: Production
36
+ FROM python:3.10-slim
37
+
38
+ # Install runtime dependencies
39
+ RUN apt-get update && apt-get install -y \
40
+ sqlite3 \
41
+ && rm -rf /var/lib/apt/lists/* \
42
+ && apt-get clean
43
+
44
+ # Create non-root user
45
+ RUN groupadd -g 1000 appuser && useradd -r -u 1000 -g appuser appuser
46
+
47
+ # Copy virtual environment from builder
48
+ COPY --from=builder /opt/venv /opt/venv
49
+ ENV PATH="/opt/venv/bin:$PATH"
50
+
51
+ # Copy cached models
52
+ COPY --from=builder /app/cache /app/cache
53
+
54
+ # Create required directories
55
+ RUN mkdir -p /app/data /app/logs /app/uploads && \
56
+ chown -R appuser:appuser /app/data /app/logs /app/uploads /app/cache
57
+
58
+ # Set working directory
59
+ WORKDIR /app
60
+
61
+ # Copy all files
62
+ COPY --chown=appuser:appuser . .
63
+
64
+ # Environment variables
65
+ ENV PYTHONPATH=/app
66
+ ENV HF_HOME=/app/cache
67
+ ENV LOG_LEVEL=INFO
68
+ ENV ENVIRONMENT=production
69
+ ENV PYTHONUNBUFFERED=1
70
+ ENV API_BASE_URL=http://localhost:8000
71
+ ENV APP_MODE=gradio
72
+
73
+ # Switch to non-root user
74
+ USER appuser
75
+
76
+ # Expose ports for Gradio and FastAPI
77
+ EXPOSE 7860
78
+ EXPOSE 8000
79
+
80
+ # Start application via run.py
81
+ CMD ["python", "run.py"]
82
+ ```
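Reviewer note: the new builder steps bake both model snapshots into `/app/cache` at build time, and the `|| true` suffix lets the image build even if a download fails, at the cost of hiding that failure until runtime. A sketch for verifying the baked cache inside the built image, using the real `local_files_only` flag so a missing snapshot raises instead of silently re-downloading:

```python
# Assumes it runs inside the built image, where /app/cache was populated.
from transformers import AutoModel, AutoTokenizer

CACHE = "/app/cache"  # same path the builder stage fills
NAME = "HooshvareLab/bert-fa-base-uncased"

# local_files_only=True forbids network access, so this only succeeds
# if the build-time download actually landed in the cache.
tokenizer = AutoTokenizer.from_pretrained(NAME, cache_dir=CACHE, local_files_only=True)
model = AutoModel.from_pretrained(NAME, cache_dir=CACHE, local_files_only=True)
print("cache OK:", model.config.model_type)
```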
enhanced_legal_scraper.py CHANGED
@@ -1,9 +1,11 @@
+```python
 import requests
 import time
 import json
 import csv
 import sqlite3
 import logging
+import os
 from datetime import datetime, timedelta
 from typing import Dict, List, Optional, Tuple
 from urllib.parse import urljoin, urlparse
@@ -24,12 +26,16 @@ except ImportError as e:
     NLP_AVAILABLE = False
     logging.warning(f"⚠️ NLP libraries not available: {e}")
 
+# Create log directory
+log_dir = '/app/logs'
+os.makedirs(log_dir, exist_ok=True)
+
 # Configure logging
 logging.basicConfig(
     level=logging.INFO,
     format='%(asctime)s - %(levelname)s - %(message)s',
     handlers=[
-        logging.FileHandler('/app/logs/legal_scraper.log'),
+        logging.FileHandler(os.path.join(log_dir, 'legal_scraper.log')),
         logging.StreamHandler()
     ]
 )
@@ -85,14 +91,18 @@ class PersianNLPProcessor:
         self.model_tokenizer = None
         if NLP_AVAILABLE:
             try:
+                logger.info("Initializing Persian NLP components...")
                 self.normalizer = Normalizer()
                 self.tokenizer = WordTokenizer()
                 self.sentence_tokenizer = SentenceTokenizer()
-                self.model = AutoModel.from_pretrained("HooshvareLab/bert-fa-base-uncased")
-                self.model_tokenizer = AutoTokenizer.from_pretrained("HooshvareLab/bert-fa-base-uncased")
+                if os.getenv("ENVIRONMENT") != "huggingface_free":
+                    self.model = AutoModel.from_pretrained("HooshvareLab/bert-fa-base-uncased", cache_dir="/app/cache")
+                    self.model_tokenizer = AutoTokenizer.from_pretrained("HooshvareLab/bert-fa-base-uncased", cache_dir="/app/cache")
+                logger.info("Persian NLP components initialized")
             except Exception as e:
-                logger.error(f"Failed to initialize NLP components: {e}")
+                logger.warning(f"Failed to initialize NLP components: {e}. Falling back to basic text processing.")
                 self.model = None
+                self.model_tokenizer = None
 
     def normalize_text(self, text: str) -> str:
         if self.normalizer:
@@ -185,6 +195,8 @@
 
 class EnhancedLegalScraper:
     def __init__(self, delay: float = 2.0, db_path: str = "/app/data/legal_scraper.db"):
+        # Create data directory
+        os.makedirs('/app/data', exist_ok=True)
         self.nlp = PersianNLPProcessor() if NLP_AVAILABLE else None
         self.session = requests.Session()
         self.delay = delay
@@ -212,7 +224,7 @@ class EnhancedLegalScraper:
                 id INTEGER PRIMARY KEY AUTOINCREMENT,
                 title TEXT NOT NULL,
                 content TEXT NOT NULL,
                 source_url TEXT UNIQUE NOT NULL,
                 document_type TEXT NOT NULL,
                 date_published TEXT,
                 date_scraped TEXT NOT NULL,
@@ -615,4 +627,5 @@ class EnhancedLegalScraper:
             return stats
         except Exception as e:
             logger.error(f"Statistics failed: {e}")
             return {}
+```
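Reviewer note: the guard pattern added here, skipping the BERT load when `ENVIRONMENT=huggingface_free` and downgrading load failures to a warning so hazm-only processing still works, is the main behavioral change in this file. A compact sketch of just that guard (a stand-in class, not the file's full `PersianNLPProcessor`):

```python
import logging
import os

logger = logging.getLogger(__name__)

class PersianNLPStub:
    """Stand-in showing the env-gated model load with graceful fallback."""

    def __init__(self) -> None:
        self.model = None
        self.model_tokenizer = None
        if os.getenv("ENVIRONMENT") == "huggingface_free":
            return  # free tier: skip the heavy BERT load entirely
        try:
            from transformers import AutoModel, AutoTokenizer
            name = "HooshvareLab/bert-fa-base-uncased"
            self.model = AutoModel.from_pretrained(name, cache_dir="/app/cache")
            self.model_tokenizer = AutoTokenizer.from_pretrained(name, cache_dir="/app/cache")
        except Exception as exc:  # degrade instead of crashing the scraper
            logger.warning("NLP model unavailable (%s); basic processing only", exc)
            self.model = None
            self.model_tokenizer = None
```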
main.py CHANGED
@@ -1,3 +1,4 @@
+```python
 import os
 import tempfile
 import logging
@@ -29,12 +30,16 @@ except ImportError as e:
     ML_AVAILABLE = False
     logger.warning(f"⚠️ ML libraries not available: {e}")
 
+# Create log directory
+log_dir = '/app/logs'
+os.makedirs(log_dir, exist_ok=True)
+
 # Configure logging
 logging.basicConfig(
     level=os.getenv("LOG_LEVEL", "INFO").upper(),
     format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
+        logging.FileHandler(os.path.join(log_dir, 'legal_dashboard.log')),
-        logging.FileHandler('/app/logs/legal_dashboard.log'),
         logging.StreamHandler()
     ]
 )
@@ -66,19 +71,19 @@ class OCRService:
         self.model = None
         self.processor = None
         self.model_loaded = False
+        if ML_AVAILABLE and os.getenv("ENVIRONMENT") != "huggingface_free":
+            self._load_model()
 
-    async def _load_model_async(self):
-        if not ML_AVAILABLE:
-            return
+    def _load_model(self):
         try:
             logger.info("Loading TrOCR model...")
             model_name = "microsoft/trocr-base-printed"
-            self.processor = TrOCRProcessor.from_pretrained(model_name)
-            self.model = VisionEncoderDecoderModel.from_pretrained(model_name)
+            self.processor = TrOCRProcessor.from_pretrained(model_name, cache_dir="/app/cache")
+            self.model = VisionEncoderDecoderModel.from_pretrained(model_name, cache_dir="/app/cache")
             self.model_loaded = True
             logger.info("✅ TrOCR model loaded successfully")
         except Exception as e:
-            logger.error(f"❌ Failed to load TrOCR model: {e}")
+            logger.warning(f"❌ Failed to load TrOCR model: {e}. OCR will use basic processing.")
             self.model_loaded = False
 
     async def extract_text_from_pdf(self, file_path: str) -> OCRResponse:
@@ -163,8 +168,8 @@ legal_api = LegalDashboardAPI()
 
 @app.on_event("startup")
 async def startup_event():
-    if ML_AVAILABLE:
-        await legal_api.ocr_service._load_model_async()
+    if ML_AVAILABLE and os.getenv("ENVIRONMENT") != "huggingface_free":
+        legal_api.ocr_service._load_model()
 
 @app.get("/health")
 async def health_check():
@@ -299,4 +304,5 @@ async def global_exception_handler(request: Request, exc: Exception):
 
 if __name__ == "__main__":
     import uvicorn
     uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=False, log_level="info")
+```
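Reviewer note: after this change `_load_model()` runs twice when models are enabled, once from `OCRService.__init__` and again from the `startup` hook, under the identical condition, so the TrOCR weights are loaded a second time for no benefit. An idempotence guard makes the repeat call a no-op; a sketch against a minimal stand-in for the service (not the full class from `main.py`):

```python
import logging
import os

logger = logging.getLogger(__name__)

class OCRServiceStub:
    def __init__(self) -> None:
        self.processor = None
        self.model = None
        self.model_loaded = False
        if os.getenv("ENVIRONMENT") != "huggingface_free":
            self._load_model()

    def _load_model(self) -> None:
        if self.model_loaded:
            return  # guard: __init__ and the FastAPI startup hook both call this
        try:
            from transformers import TrOCRProcessor, VisionEncoderDecoderModel
            name = "microsoft/trocr-base-printed"
            self.processor = TrOCRProcessor.from_pretrained(name, cache_dir="/app/cache")
            self.model = VisionEncoderDecoderModel.from_pretrained(name, cache_dir="/app/cache")
            self.model_loaded = True
        except Exception as exc:
            logger.warning("TrOCR unavailable (%s); OCR falls back to basic mode", exc)
            self.model_loaded = False
```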
requirements.txt CHANGED
@@ -1,16 +1,18 @@
1
- requests>=2.28.0
2
- beautifulsoup4>=4.11.0
3
- pandas>=2.0.0,<3.0.0
4
- scikit-learn>=1.4.0
5
- transformers>=4.39.0
6
- torch>=2.0.0
7
- hazm==0.10.0
8
- numpy==1.24.3
9
- gradio>=4.0.0
10
- fastapi>=0.95.0
11
- uvicorn>=0.20.0
12
- python-multipart>=0.0.6
13
- pillow>=9.0.0
14
- pymupdf>=1.21.0
15
- python-dotenv>=0.21.0
16
- plotly>=5.0.0
 
 
 
1
+ ```
2
+ requests>=2.28.0
3
+ beautifulsoup4>=4.11.0
4
+ pandas>=2.0.0,<3.0.0
5
+ scikit-learn>=1.4.0
6
+ transformers>=4.39.0
7
+ torch>=2.0.0
8
+ hazm==0.10.0
9
+ numpy==1.24.3
10
+ gradio>=4.0.0
11
+ fastapi>=0.95.0
12
+ uvicorn>=0.20.0
13
+ python-multipart>=0.0.6
14
+ pillow>=9.0.0
15
+ pymupdf>=1.21.0
16
+ python-dotenv>=0.21.0
17
+ plotly>=5.0.0
18
+ ```