Spaces:

Really-amin
/

Hoghoghi

Paused

Really-amin committed on Aug 31

Commit

8acdbd0

verified ·

1 Parent(s): 91b299f

Upload 4 files

Files changed (4) hide show

dockerfile CHANGED Viewed

@@ -29,7 +29,7 @@ RUN mkdir -p /app/cache && \
                AutoModel.from_pretrained('HooshvareLab/bert-fa-base-uncased', cache_dir='/app/cache'); \
                AutoTokenizer.from_pretrained('HooshvareLab/bert-fa-base-uncased', cache_dir='/app/cache')" || true
 RUN python -c "from transformers import TrOCRProcessor, VisionEncoderDecoderModel; \
-               TrOCRProcessor.from_pretrained('microsoft/trocr-base-printed', cache_dir='/app/cache'); \
                VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-printed', cache_dir='/app/cache')" || true
 # Stage 2: Production

                AutoModel.from_pretrained('HooshvareLab/bert-fa-base-uncased', cache_dir='/app/cache'); \
                AutoTokenizer.from_pretrained('HooshvareLab/bert-fa-base-uncased', cache_dir='/app/cache')" || true
 RUN python -c "from transformers import TrOCRProcessor, VisionEncoderDecoderModel; \
+               TrOCRProcessor.from_pretrained('microsoft/trocr-base-printed', cache_dir='/app/cache', use_fast=True); \
                VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-printed', cache_dir='/app/cache')" || true
 # Stage 2: Production

enhanced_legal_scraper.py CHANGED Viewed

@@ -26,9 +26,13 @@ except ImportError as e:
     NLP_AVAILABLE = False
     logging.warning(f"⚠️ NLP libraries not available: {e}")
-# Create log directory
 log_dir = '/app/logs'
 os.makedirs(log_dir, exist_ok=True)
 # Configure logging
 logging.basicConfig(
@@ -195,8 +199,6 @@ class PersianNLPProcessor:
 class EnhancedLegalScraper:
     def __init__(self, delay: float = 2.0, db_path: str = "/app/data/legal_scraper.db"):
-        # Create data directory
-        os.makedirs('/app/data', exist_ok=True)
         self.nlp = PersianNLPProcessor() if NLP_AVAILABLE else None
         self.session = requests.Session()
         self.delay = delay
@@ -224,7 +226,7 @@ class EnhancedLegalScraper:
                     id INTEGER PRIMARY KEY AUTOINCREMENT,
                     title TEXT NOT NULL,
                     content TEXT NOT NULL,
-                    source_url TEXT UNIQUE NOT NOT NULL,
                     document_type TEXT NOT NULL,
                     date_published TEXT,
                     date_scraped TEXT NOT NULL,
@@ -358,7 +360,7 @@ class EnhancedLegalScraper:
             score += 0.2
         return min(score, 1.0)
-    def scrape_real_sources(self, source_urls: List[str] = None, max_docs: int = 20) -> List[LegalDocument]:
         if not source_urls:
             source_urls = IRANIAN_LEGAL_SOURCES
         documents = []

     NLP_AVAILABLE = False
     logging.warning(f"⚠️ NLP libraries not available: {e}")
+# Create required directories
 log_dir = '/app/logs'
+data_dir = '/app/data'
+cache_dir = '/app/cache'
 os.makedirs(log_dir, exist_ok=True)
+os.makedirs(data_dir, exist_ok=True)
+os.makedirs(cache_dir, exist_ok=True)
 # Configure logging
 logging.basicConfig(
 class EnhancedLegalScraper:
     def __init__(self, delay: float = 2.0, db_path: str = "/app/data/legal_scraper.db"):
         self.nlp = PersianNLPProcessor() if NLP_AVAILABLE else None
         self.session = requests.Session()
         self.delay = delay
                     id INTEGER PRIMARY KEY AUTOINCREMENT,
                     title TEXT NOT NULL,
                     content TEXT NOT NULL,
+                    source_url TEXT UNIQUE NOT NULL,
                     document_type TEXT NOT NULL,
                     date_published TEXT,
                     date_scraped TEXT NOT NULL,
             score += 0.2
         return min(score, 1.0)
+    def scrape_real_sources(self, source_urls: List[str] = None, max_docs: int = 10) -> List[LegalDocument]:
         if not source_urls:
             source_urls = IRANIAN_LEGAL_SOURCES
         documents = []

main.py CHANGED Viewed

@@ -78,7 +78,7 @@ class OCRService:
         try:
             logger.info("Loading TrOCR model...")
             model_name = "microsoft/trocr-base-printed"
-            self.processor = TrOCRProcessor.from_pretrained(model_name, cache_dir="/app/cache")
             self.model = VisionEncoderDecoderModel.from_pretrained(model_name, cache_dir="/app/cache")
             self.model_loaded = True
             logger.info("✅ TrOCR model loaded successfully")

         try:
             logger.info("Loading TrOCR model...")
             model_name = "microsoft/trocr-base-printed"
+            self.processor = TrOCRProcessor.from_pretrained(model_name, cache_dir="/app/cache", use_fast=True)
             self.model = VisionEncoderDecoderModel.from_pretrained(model_name, cache_dir="/app/cache")
             self.model_loaded = True
             logger.info("✅ TrOCR model loaded successfully")

requirements.txt CHANGED Viewed

@@ -1,3 +1,4 @@
 requests>=2.28.0
 beautifulsoup4>=4.11.0
 pandas>=2.0.0,<3.0.0
@@ -13,4 +14,5 @@ python-multipart>=0.0.6
 pillow>=9.0.0
 pymupdf>=1.21.0
 python-dotenv>=0.21.0
-plotly>=5.0.0

+```
 requests>=2.28.0
 beautifulsoup4>=4.11.0
 pandas>=2.0.0,<3.0.0
 pillow>=9.0.0
 pymupdf>=1.21.0
 python-dotenv>=0.21.0
+plotly>=5.0.0
+```