Really-amin committed (verified)
Commit 8acdbd0 · 1 parent: 91b299f

Upload 4 files

Files changed (4):
  1. dockerfile +1 -1
  2. enhanced_legal_scraper.py +7 -5
  3. main.py +1 -1
  4. requirements.txt +3 -1
dockerfile CHANGED
@@ -29,7 +29,7 @@ RUN mkdir -p /app/cache && \
     AutoModel.from_pretrained('HooshvareLab/bert-fa-base-uncased', cache_dir='/app/cache'); \
     AutoTokenizer.from_pretrained('HooshvareLab/bert-fa-base-uncased', cache_dir='/app/cache')" || true
 RUN python -c "from transformers import TrOCRProcessor, VisionEncoderDecoderModel; \
-    TrOCRProcessor.from_pretrained('microsoft/trocr-base-printed', cache_dir='/app/cache'); \
+    TrOCRProcessor.from_pretrained('microsoft/trocr-base-printed', cache_dir='/app/cache', use_fast=True); \
     VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-printed', cache_dir='/app/cache')" || true

 # Stage 2: Production
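The RUN steps above warm the Hugging Face cache at image build time so the container does not download models on first start, and the trailing `|| true` lets the build continue if a download fails. Below is a minimal sketch of the same warm-up written as a standalone script, assuming only that transformers is installed and /app/cache is writable; use_fast is forwarded to the underlying processor classes, and how it is honored depends on the installed transformers version.

# prefetch_models.py - sketch of the build-time model warm-up the Dockerfile
# RUN steps perform; errors are swallowed, mirroring the `|| true` guard.
from transformers import TrOCRProcessor, VisionEncoderDecoderModel

CACHE_DIR = "/app/cache"  # same cache directory the Dockerfile uses

try:
    # use_fast=True requests the fast image processor on transformers
    # versions that support it; the kwarg is simply forwarded otherwise.
    TrOCRProcessor.from_pretrained(
        "microsoft/trocr-base-printed", cache_dir=CACHE_DIR, use_fast=True
    )
    VisionEncoderDecoderModel.from_pretrained(
        "microsoft/trocr-base-printed", cache_dir=CACHE_DIR
    )
except Exception as exc:  # equivalent of the Dockerfile's `|| true`
    print(f"Model prefetch skipped: {exc}")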
enhanced_legal_scraper.py CHANGED
@@ -26,9 +26,13 @@ except ImportError as e:
     NLP_AVAILABLE = False
     logging.warning(f"⚠️ NLP libraries not available: {e}")

-# Create log directory
+# Create required directories
 log_dir = '/app/logs'
+data_dir = '/app/data'
+cache_dir = '/app/cache'
 os.makedirs(log_dir, exist_ok=True)
+os.makedirs(data_dir, exist_ok=True)
+os.makedirs(cache_dir, exist_ok=True)

 # Configure logging
 logging.basicConfig(
@@ -195,8 +199,6 @@ class PersianNLPProcessor:

 class EnhancedLegalScraper:
     def __init__(self, delay: float = 2.0, db_path: str = "/app/data/legal_scraper.db"):
-        # Create data directory
-        os.makedirs('/app/data', exist_ok=True)
         self.nlp = PersianNLPProcessor() if NLP_AVAILABLE else None
         self.session = requests.Session()
         self.delay = delay
@@ -224,7 +226,7 @@ class EnhancedLegalScraper:
                 id INTEGER PRIMARY KEY AUTOINCREMENT,
                 title TEXT NOT NULL,
                 content TEXT NOT NULL,
-                source_url TEXT UNIQUE NOT NOT NULL,
+                source_url TEXT UNIQUE NOT NULL,
                 document_type TEXT NOT NULL,
                 date_published TEXT,
                 date_scraped TEXT NOT NULL,
@@ -358,7 +360,7 @@ class EnhancedLegalScraper:
             score += 0.2
         return min(score, 1.0)

-    def scrape_real_sources(self, source_urls: List[str] = None, max_docs: int = 20) -> List[LegalDocument]:
+    def scrape_real_sources(self, source_urls: List[str] = None, max_docs: int = 10) -> List[LegalDocument]:
         if not source_urls:
             source_urls = IRANIAN_LEGAL_SOURCES
         documents = []
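Two of the hunks above are bug fixes: the duplicated NOT in the schema (`UNIQUE NOT NOT NULL`) is invalid SQLite and would make table creation fail, and the directory creation moves from EnhancedLegalScraper.__init__ to import time so /app/data, /app/logs, and /app/cache exist before logging or the database are touched. A minimal sketch of the corrected schema setup follows; the table name documents is assumed, since the hunk does not show the surrounding CREATE TABLE line, and only the columns visible in the diff are included.

# Sketch of the corrected SQLite setup, assuming a table named `documents`
# (the hunk does not show the real table name) and only the columns that
# appear in the diff.
import os
import sqlite3

os.makedirs("/app/data", exist_ok=True)  # now created at import time

conn = sqlite3.connect("/app/data/legal_scraper.db")
conn.execute("""
    CREATE TABLE IF NOT EXISTS documents (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        title TEXT NOT NULL,
        content TEXT NOT NULL,
        source_url TEXT UNIQUE NOT NULL,  -- was 'UNIQUE NOT NOT NULL', a syntax error
        document_type TEXT NOT NULL,
        date_published TEXT,
        date_scraped TEXT NOT NULL
    )
""")
conn.commit()
conn.close()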
main.py CHANGED
@@ -78,7 +78,7 @@ class OCRService:
         try:
             logger.info("Loading TrOCR model...")
             model_name = "microsoft/trocr-base-printed"
-            self.processor = TrOCRProcessor.from_pretrained(model_name, cache_dir="/app/cache")
+            self.processor = TrOCRProcessor.from_pretrained(model_name, cache_dir="/app/cache", use_fast=True)
             self.model = VisionEncoderDecoderModel.from_pretrained(model_name, cache_dir="/app/cache")
             self.model_loaded = True
             logger.info("✅ TrOCR model loaded successfully")
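For context, the processor/model pair loaded here is used with the standard TrOCR inference pattern. The snippet below only illustrates that downstream usage and is not part of the commit; the input file name is hypothetical.

# Illustrative TrOCR inference with the objects OCRService loads; not part
# of the commit. "page.png" is a hypothetical input image.
from PIL import Image
from transformers import TrOCRProcessor, VisionEncoderDecoderModel

processor = TrOCRProcessor.from_pretrained(
    "microsoft/trocr-base-printed", cache_dir="/app/cache", use_fast=True
)
model = VisionEncoderDecoderModel.from_pretrained(
    "microsoft/trocr-base-printed", cache_dir="/app/cache"
)

image = Image.open("page.png").convert("RGB")
pixel_values = processor(images=image, return_tensors="pt").pixel_values
generated_ids = model.generate(pixel_values)
text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(text)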
requirements.txt CHANGED
@@ -1,3 +1,4 @@
+```
 requests>=2.28.0
 beautifulsoup4>=4.11.0
 pandas>=2.0.0,<3.0.0
@@ -13,4 +14,5 @@ python-multipart>=0.0.6
 pillow>=9.0.0
 pymupdf>=1.21.0
 python-dotenv>=0.21.0
-plotly>=5.0.0
+plotly>=5.0.0
+```