Spaces:
Paused
Paused
Upload 4 files
Browse files
- dockerfile +1 -1
- enhanced_legal_scraper.py +7 -5
- main.py +1 -1
- requirements.txt +3 -1
dockerfile
CHANGED
@@ -29,7 +29,7 @@ RUN mkdir -p /app/cache && \
|
|
29 |
AutoModel.from_pretrained('HooshvareLab/bert-fa-base-uncased', cache_dir='/app/cache'); \
|
30 |
AutoTokenizer.from_pretrained('HooshvareLab/bert-fa-base-uncased', cache_dir='/app/cache')" || true
|
31 |
RUN python -c "from transformers import TrOCRProcessor, VisionEncoderDecoderModel; \
|
32 |
-
TrOCRProcessor.from_pretrained('microsoft/trocr-base-printed', cache_dir='/app/cache'); \
|
33 |
VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-printed', cache_dir='/app/cache')" || true
|
34 |
|
35 |
# Stage 2: Production
|
|
|
29 |
AutoModel.from_pretrained('HooshvareLab/bert-fa-base-uncased', cache_dir='/app/cache'); \
|
30 |
AutoTokenizer.from_pretrained('HooshvareLab/bert-fa-base-uncased', cache_dir='/app/cache')" || true
|
31 |
RUN python -c "from transformers import TrOCRProcessor, VisionEncoderDecoderModel; \
|
32 |
+
TrOCRProcessor.from_pretrained('microsoft/trocr-base-printed', cache_dir='/app/cache', use_fast=True); \
|
33 |
VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-printed', cache_dir='/app/cache')" || true
|
34 |
|
35 |
# Stage 2: Production
|
enhanced_legal_scraper.py
CHANGED
@@ -26,9 +26,13 @@ except ImportError as e:
|
|
26 |
NLP_AVAILABLE = False
|
27 |
logging.warning(f"⚠️ NLP libraries not available: {e}")
|
28 |
|
29 |
-
# Create
|
30 |
log_dir = '/app/logs'
|
|
|
|
|
31 |
os.makedirs(log_dir, exist_ok=True)
|
|
|
|
|
32 |
|
33 |
# Configure logging
|
34 |
logging.basicConfig(
|
@@ -195,8 +199,6 @@ class PersianNLPProcessor:
|
|
195 |
|
196 |
class EnhancedLegalScraper:
|
197 |
def __init__(self, delay: float = 2.0, db_path: str = "/app/data/legal_scraper.db"):
|
198 |
-
# Create data directory
|
199 |
-
os.makedirs('/app/data', exist_ok=True)
|
200 |
self.nlp = PersianNLPProcessor() if NLP_AVAILABLE else None
|
201 |
self.session = requests.Session()
|
202 |
self.delay = delay
|
@@ -224,7 +226,7 @@ class EnhancedLegalScraper:
|
|
224 |
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
225 |
title TEXT NOT NULL,
|
226 |
content TEXT NOT NULL,
|
227 |
-
source_url TEXT UNIQUE NOT
|
228 |
document_type TEXT NOT NULL,
|
229 |
date_published TEXT,
|
230 |
date_scraped TEXT NOT NULL,
|
@@ -358,7 +360,7 @@ class EnhancedLegalScraper:
|
|
358 |
score += 0.2
|
359 |
return min(score, 1.0)
|
360 |
|
361 |
-
def scrape_real_sources(self, source_urls: List[str] = None, max_docs: int =
|
362 |
if not source_urls:
|
363 |
source_urls = IRANIAN_LEGAL_SOURCES
|
364 |
documents = []
|
|
|
26 |
NLP_AVAILABLE = False
|
27 |
logging.warning(f"⚠️ NLP libraries not available: {e}")
|
28 |
|
29 |
+
# Create required directories
|
30 |
log_dir = '/app/logs'
|
31 |
+
data_dir = '/app/data'
|
32 |
+
cache_dir = '/app/cache'
|
33 |
os.makedirs(log_dir, exist_ok=True)
|
34 |
+
os.makedirs(data_dir, exist_ok=True)
|
35 |
+
os.makedirs(cache_dir, exist_ok=True)
|
36 |
|
37 |
# Configure logging
|
38 |
logging.basicConfig(
|
|
|
199 |
|
200 |
class EnhancedLegalScraper:
|
201 |
def __init__(self, delay: float = 2.0, db_path: str = "/app/data/legal_scraper.db"):
|
|
|
|
|
202 |
self.nlp = PersianNLPProcessor() if NLP_AVAILABLE else None
|
203 |
self.session = requests.Session()
|
204 |
self.delay = delay
|
|
|
226 |
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
227 |
title TEXT NOT NULL,
|
228 |
content TEXT NOT NULL,
|
229 |
+
source_url TEXT UNIQUE NOT NULL,
|
230 |
document_type TEXT NOT NULL,
|
231 |
date_published TEXT,
|
232 |
date_scraped TEXT NOT NULL,
|
|
|
360 |
score += 0.2
|
361 |
return min(score, 1.0)
|
362 |
|
363 |
+
def scrape_real_sources(self, source_urls: List[str] = None, max_docs: int = 10) -> List[LegalDocument]:
|
364 |
if not source_urls:
|
365 |
source_urls = IRANIAN_LEGAL_SOURCES
|
366 |
documents = []
|
main.py
CHANGED
@@ -78,7 +78,7 @@ class OCRService:
|
|
78 |
try:
|
79 |
logger.info("Loading TrOCR model...")
|
80 |
model_name = "microsoft/trocr-base-printed"
|
81 |
-
self.processor = TrOCRProcessor.from_pretrained(model_name, cache_dir="/app/cache")
|
82 |
self.model = VisionEncoderDecoderModel.from_pretrained(model_name, cache_dir="/app/cache")
|
83 |
self.model_loaded = True
|
84 |
logger.info("✅ TrOCR model loaded successfully")
|
|
|
78 |
try:
|
79 |
logger.info("Loading TrOCR model...")
|
80 |
model_name = "microsoft/trocr-base-printed"
|
81 |
+
self.processor = TrOCRProcessor.from_pretrained(model_name, cache_dir="/app/cache", use_fast=True)
|
82 |
self.model = VisionEncoderDecoderModel.from_pretrained(model_name, cache_dir="/app/cache")
|
83 |
self.model_loaded = True
|
84 |
logger.info("✅ TrOCR model loaded successfully")
|
requirements.txt
CHANGED
@@ -1,3 +1,4 @@
|
|
|
|
1 |
requests>=2.28.0
|
2 |
beautifulsoup4>=4.11.0
|
3 |
pandas>=2.0.0,<3.0.0
|
@@ -13,4 +14,5 @@ python-multipart>=0.0.6
|
|
13 |
pillow>=9.0.0
|
14 |
pymupdf>=1.21.0
|
15 |
python-dotenv>=0.21.0
|
16 |
-
plotly>=5.0.0
|
|
|
|
1 |
+
```
|
2 |
requests>=2.28.0
|
3 |
beautifulsoup4>=4.11.0
|
4 |
pandas>=2.0.0,<3.0.0
|
|
|
14 |
pillow>=9.0.0
|
15 |
pymupdf>=1.21.0
|
16 |
python-dotenv>=0.21.0
|
17 |
+
plotly>=5.0.0
|
18 |
+
```
|