	| """ | |
| Advanced Web Scraping Service | |
| ============================= | |
| Production-grade web scraping service with multiple strategies, async processing, | |
| and comprehensive error handling for the Legal Dashboard OCR system. | |
| """ | |
| import asyncio | |
| import aiohttp | |
| import logging | |
| from datetime import datetime, timezone, timedelta | |
| from typing import Dict, List, Optional, Any, Union | |
| from dataclasses import dataclass, asdict | |
| from enum import Enum | |
| import json | |
| import re | |
| from urllib.parse import urlparse, urljoin | |
| from bs4 import BeautifulSoup | |
| import hashlib | |
| from concurrent.futures import ThreadPoolExecutor | |
| import time | |
| from pydantic import BaseModel, Field | |
| import sqlite3 | |
| from pathlib import Path | |
| logger = logging.getLogger(__name__) | |


class ScrapingStrategy(Enum):
    """Available scraping strategies"""
    GENERAL = "general"
    LEGAL_DOCUMENTS = "legal_documents"
    NEWS_ARTICLES = "news_articles"
    ACADEMIC_PAPERS = "academic_papers"
    GOVERNMENT_SITES = "government_sites"
    CUSTOM = "custom"


class ProcessingStatus(Enum):
    """Processing status for scraped items"""
    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"
    RATED = "rated"


@dataclass
class ScrapedItem:
    """Data structure for scraped items"""
    id: str
    url: str
    title: str
    content: str
    metadata: Dict[str, Any]
    timestamp: datetime
    source_url: str
    rating_score: float = 0.0
    processing_status: ProcessingStatus = ProcessingStatus.PENDING
    error_message: Optional[str] = None
    strategy_used: ScrapingStrategy = ScrapingStrategy.GENERAL
    content_hash: str = ""
    word_count: int = 0
    language: str = "unknown"
    domain: str = ""

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for storage"""
        data = asdict(self)
        data['timestamp'] = self.timestamp.isoformat()
        data['processing_status'] = self.processing_status.value
        data['strategy_used'] = self.strategy_used.value
        return data


class ScrapingJob(BaseModel):
    """Scraping job configuration"""
    job_id: str
    urls: List[str]
    strategy: ScrapingStrategy = ScrapingStrategy.GENERAL
    keywords: Optional[List[str]] = None
    content_types: Optional[List[str]] = None
    max_depth: int = 1
    delay_between_requests: float = 1.0
    timeout: int = 30
    created_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc))
    status: str = "pending"
    total_items: int = 0
    completed_items: int = 0
    failed_items: int = 0
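

# Illustrative example only (hypothetical values): a ScrapingJob is normally
# created via ScrapingService.start_scraping_job(), but it can be built directly:
#
#   job = ScrapingJob(
#       job_id="scrape_job_20240101_000000_deadbeef",
#       urls=["https://example.com/ruling-123"],
#       strategy=ScrapingStrategy.LEGAL_DOCUMENTS,
#       delay_between_requests=2.0,
#   )
#
# Remaining fields fall back to their declared defaults.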


class ScrapingService:
    """Advanced web scraping service with multiple strategies"""

    def __init__(self, db_path: str = "legal_documents.db"):
        self.db_path = db_path
        self.active_jobs: Dict[str, ScrapingJob] = {}
        self.session: Optional[aiohttp.ClientSession] = None
        self.executor = ThreadPoolExecutor(max_workers=10)
        self._initialize_database()

    def _initialize_database(self):
        """Initialize database tables for scraping data"""
        try:
            with sqlite3.connect(self.db_path) as conn:
                cursor = conn.cursor()
                # Create scraped_items table
                cursor.execute("""
                    CREATE TABLE IF NOT EXISTS scraped_items (
                        id TEXT PRIMARY KEY,
                        url TEXT NOT NULL,
                        title TEXT,
                        content TEXT,
                        metadata TEXT,
                        timestamp TEXT,
                        source_url TEXT,
                        rating_score REAL DEFAULT 0.0,
                        processing_status TEXT DEFAULT 'pending',
                        error_message TEXT,
                        strategy_used TEXT,
                        content_hash TEXT,
                        word_count INTEGER DEFAULT 0,
                        language TEXT DEFAULT 'unknown',
                        domain TEXT
                    )
                """)
                # Create scraping_jobs table
                cursor.execute("""
                    CREATE TABLE IF NOT EXISTS scraping_jobs (
                        job_id TEXT PRIMARY KEY,
                        urls TEXT,
                        strategy TEXT,
                        keywords TEXT,
                        content_types TEXT,
                        max_depth INTEGER DEFAULT 1,
                        delay_between_requests REAL DEFAULT 1.0,
                        timeout INTEGER DEFAULT 30,
                        created_at TEXT,
                        status TEXT DEFAULT 'pending',
                        total_items INTEGER DEFAULT 0,
                        completed_items INTEGER DEFAULT 0,
                        failed_items INTEGER DEFAULT 0
                    )
                """)
                conn.commit()
                logger.info("✅ Scraping database initialized successfully")
        except Exception as e:
            logger.error(f"❌ Failed to initialize scraping database: {e}")

    async def start_session(self):
        """Start aiohttp session"""
        if not self.session:
            timeout = aiohttp.ClientTimeout(total=30)
            self.session = aiohttp.ClientSession(
                timeout=timeout,
                headers={
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                }
            )

    async def close_session(self):
        """Close aiohttp session"""
        if self.session:
            await self.session.close()
            self.session = None

    def _generate_job_id(self) -> str:
        """Generate unique job ID"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        return f"scrape_job_{timestamp}_{hashlib.md5(str(time.time()).encode()).hexdigest()[:8]}"

    def _generate_item_id(self, url: str) -> str:
        """Generate unique item ID based on URL"""
        url_hash = hashlib.md5(url.encode()).hexdigest()
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        return f"item_{timestamp}_{url_hash[:8]}"

    def _extract_domain(self, url: str) -> str:
        """Extract domain from URL"""
        try:
            parsed = urlparse(url)
            return parsed.netloc
        except Exception:
            return "unknown"

    def _calculate_content_hash(self, content: str) -> str:
        """Calculate hash of content for deduplication"""
        return hashlib.md5(content.encode()).hexdigest()

    def _count_words(self, text: str) -> int:
        """Count words in text"""
        return len(text.split())

    def _detect_language(self, text: str) -> str:
        """Simple language detection (can be enhanced)"""
        # Simple Persian detection
        persian_chars = re.findall(r'[\u0600-\u06FF]', text)
        if len(persian_chars) > len(text) * 0.3:
            return "persian"
        return "english"

    async def scrape_url(self, url: str, strategy: ScrapingStrategy, job_id: str) -> Optional[ScrapedItem]:
        """Scrape a single URL with specified strategy"""
        try:
            await self.start_session()
            async with self.session.get(url) as response:
                if response.status != 200:
                    logger.warning(
                        f"Failed to fetch {url}: Status {response.status}")
                    return None
                content_type = response.headers.get('content-type', '')
                if 'text/html' not in content_type:
                    logger.info(f"Skipping non-HTML content: {url}")
                    return None
                html_content = await response.text()
                soup = BeautifulSoup(html_content, 'html.parser')
                # Extract content based on strategy
                title, content = await self._extract_content_by_strategy(soup, strategy)
                if not content or len(content.strip()) < 50:
                    logger.warning(f"Insufficient content from {url}")
                    return None
                # Create scraped item
                item_id = self._generate_item_id(url)
                domain = self._extract_domain(url)
                content_hash = self._calculate_content_hash(content)
                word_count = self._count_words(content)
                language = self._detect_language(content)
                item = ScrapedItem(
                    id=item_id,
                    url=url,
                    title=title or "No Title",
                    content=content,
                    metadata={
                        'content_type': content_type,
                        'response_time': response.headers.get('server-timing', ''),
                        'encoding': response.charset,
                        'job_id': job_id
                    },
                    timestamp=datetime.now(timezone.utc),
                    source_url=url,
                    strategy_used=strategy,
                    content_hash=content_hash,
                    word_count=word_count,
                    language=language,
                    domain=domain,
                    processing_status=ProcessingStatus.COMPLETED
                )
                # Store in database
                await self._store_scraped_item(item)
                logger.info(
                    f"✅ Successfully scraped {url} ({word_count} words)")
                return item
        except asyncio.TimeoutError:
            logger.error(f"Timeout scraping {url}")
            return None
        except Exception as e:
            logger.error(f"Error scraping {url}: {e}")
            return None

    async def _extract_content_by_strategy(self, soup: BeautifulSoup, strategy: ScrapingStrategy) -> tuple[str, str]:
        """Extract content based on scraping strategy"""
        title = ""
        content = ""
        try:
            # Extract title
            title_tag = soup.find('title')
            if title_tag:
                title = title_tag.get_text().strip()
            # Remove unwanted elements
            for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside']):
                element.decompose()
            if strategy == ScrapingStrategy.LEGAL_DOCUMENTS:
                # Focus on legal document content
                legal_selectors = [
                    'article', '.legal-content', '.document-content',
                    '.legal-text', '.document-text', 'main'
                ]
                for selector in legal_selectors:
                    elements = soup.select(selector)
                    if elements:
                        content = ' '.join([elem.get_text().strip()
                                            for elem in elements])
                        break
                if not content:
                    # Fallback to body content
                    body = soup.find('body')
                    if body:
                        content = body.get_text().strip()
            elif strategy == ScrapingStrategy.NEWS_ARTICLES:
                # Focus on news article content
                news_selectors = [
                    'article', '.article-content', '.news-content',
                    '.story-content', '.post-content', 'main'
                ]
                for selector in news_selectors:
                    elements = soup.select(selector)
                    if elements:
                        content = ' '.join([elem.get_text().strip()
                                            for elem in elements])
                        break
                if not content:
                    # Fallback to body content
                    body = soup.find('body')
                    if body:
                        content = body.get_text().strip()
            elif strategy == ScrapingStrategy.ACADEMIC_PAPERS:
                # Focus on academic content
                academic_selectors = [
                    '.abstract', '.content', '.paper-content',
                    'article', '.research-content', 'main'
                ]
                for selector in academic_selectors:
                    elements = soup.select(selector)
                    if elements:
                        content = ' '.join([elem.get_text().strip()
                                            for elem in elements])
                        break
                if not content:
                    # Fallback to body content
                    body = soup.find('body')
                    if body:
                        content = body.get_text().strip()
            else:
                # General strategy - extract all text
                body = soup.find('body')
                if body:
                    content = body.get_text().strip()
            # Clean up content
            content = re.sub(r'\s+', ' ', content).strip()
        except Exception as e:
            logger.error(f"Error extracting content: {e}")
            content = ""
        return title, content

    async def _store_scraped_item(self, item: ScrapedItem):
        """Store scraped item in database"""
        try:
            with sqlite3.connect(self.db_path) as conn:
                cursor = conn.cursor()
                cursor.execute("""
                    INSERT OR REPLACE INTO scraped_items
                    (id, url, title, content, metadata, timestamp, source_url,
                     rating_score, processing_status, error_message, strategy_used,
                     content_hash, word_count, language, domain)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """, (
                    item.id, item.url, item.title, item.content,
                    json.dumps(item.metadata), item.timestamp.isoformat(),
                    item.source_url, item.rating_score, item.processing_status.value,
                    item.error_message, item.strategy_used.value, item.content_hash,
                    item.word_count, item.language, item.domain
                ))
                conn.commit()
        except Exception as e:
            logger.error(f"Error storing scraped item: {e}")

    async def start_scraping_job(self, urls: List[str], strategy: ScrapingStrategy = ScrapingStrategy.GENERAL,
                                 keywords: Optional[List[str]] = None, content_types: Optional[List[str]] = None,
                                 max_depth: int = 1, delay: float = 1.0) -> str:
        """Start a new scraping job"""
        job_id = self._generate_job_id()
        job = ScrapingJob(
            job_id=job_id,
            urls=urls,
            strategy=strategy,
            keywords=keywords,
            content_types=content_types,
            max_depth=max_depth,
            delay_between_requests=delay,
            total_items=len(urls)
        )
        self.active_jobs[job_id] = job
        # Store job in database
        await self._store_job(job)
        # Start scraping in background
        asyncio.create_task(self._execute_scraping_job(job))
        logger.info(f"🚀 Started scraping job {job_id} with {len(urls)} URLs")
        return job_id

    async def _execute_scraping_job(self, job: ScrapingJob):
        """Execute scraping job asynchronously"""
        try:
            job.status = "processing"
            await self._update_job_status(job)
            for i, url in enumerate(job.urls):
                try:
                    # Add delay between requests
                    if i > 0 and job.delay_between_requests > 0:
                        await asyncio.sleep(job.delay_between_requests)
                    item = await self.scrape_url(url, job.strategy, job.job_id)
                    if item:
                        job.completed_items += 1
                    else:
                        job.failed_items += 1
                    await self._update_job_status(job)
                except Exception as e:
                    logger.error(f"Error processing URL {url}: {e}")
                    job.failed_items += 1
                    await self._update_job_status(job)
            job.status = "completed"
            await self._update_job_status(job)
            logger.info(f"✅ Completed scraping job {job.job_id}")
        except Exception as e:
            logger.error(f"❌ Error in scraping job {job.job_id}: {e}")
            job.status = "failed"
            await self._update_job_status(job)

    async def _store_job(self, job: ScrapingJob):
        """Store job in database"""
        try:
            with sqlite3.connect(self.db_path) as conn:
                cursor = conn.cursor()
                cursor.execute("""
                    INSERT OR REPLACE INTO scraping_jobs
                    (job_id, urls, strategy, keywords, content_types, max_depth,
                     delay_between_requests, timeout, created_at, status,
                     total_items, completed_items, failed_items)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """, (
                    job.job_id, json.dumps(job.urls), job.strategy.value,
                    json.dumps(job.keywords) if job.keywords else None,
                    json.dumps(job.content_types) if job.content_types else None,
                    job.max_depth, job.delay_between_requests, job.timeout,
                    job.created_at.isoformat(), job.status, job.total_items,
                    job.completed_items, job.failed_items
                ))
                conn.commit()
        except Exception as e:
            logger.error(f"Error storing job: {e}")

    async def _update_job_status(self, job: ScrapingJob):
        """Update job status in database"""
        try:
            with sqlite3.connect(self.db_path) as conn:
                cursor = conn.cursor()
                cursor.execute("""
                    UPDATE scraping_jobs
                    SET status = ?, completed_items = ?, failed_items = ?
                    WHERE job_id = ?
                """, (job.status, job.completed_items, job.failed_items, job.job_id))
                conn.commit()
        except Exception as e:
            logger.error(f"Error updating job status: {e}")

    async def get_job_status(self, job_id: str) -> Optional[Dict[str, Any]]:
        """Get status of a scraping job"""
        if job_id in self.active_jobs:
            job = self.active_jobs[job_id]
            return {
                'job_id': job.job_id,
                'status': job.status,
                'total_items': job.total_items,
                'completed_items': job.completed_items,
                'failed_items': job.failed_items,
                'progress': (job.completed_items + job.failed_items) / job.total_items if job.total_items > 0 else 0,
                'created_at': job.created_at.isoformat(),
                'strategy': job.strategy.value
            }
        return None

    async def get_all_jobs(self) -> List[Dict[str, Any]]:
        """Get all scraping jobs"""
        jobs = []
        for job in self.active_jobs.values():
            jobs.append(await self.get_job_status(job.job_id))
        return [job for job in jobs if job is not None]

    async def get_scraped_items(self, job_id: Optional[str] = None,
                                limit: int = 100, offset: int = 0) -> List[Dict[str, Any]]:
        """Get scraped items with optional filtering"""
        try:
            with sqlite3.connect(self.db_path) as conn:
                cursor = conn.cursor()
                query = """
                    SELECT id, url, title, content, metadata, timestamp, source_url,
                           rating_score, processing_status, error_message, strategy_used,
                           content_hash, word_count, language, domain
                    FROM scraped_items
                """
                params = []
                if job_id:
                    query += " WHERE metadata LIKE ?"
                    params.append(f'%"job_id": "{job_id}"%')
                query += " ORDER BY timestamp DESC LIMIT ? OFFSET ?"
                params.extend([limit, offset])
                cursor.execute(query, params)
                rows = cursor.fetchall()
                items = []
                for row in rows:
                    item = {
                        'id': row[0],
                        'url': row[1],
                        'title': row[2],
                        # Truncate content
                        'content': row[3][:500] + "..." if len(row[3]) > 500 else row[3],
                        'metadata': json.loads(row[4]) if row[4] else {},
                        'timestamp': row[5],
                        'source_url': row[6],
                        'rating_score': row[7],
                        'processing_status': row[8],
                        'error_message': row[9],
                        'strategy_used': row[10],
                        'content_hash': row[11],
                        'word_count': row[12],
                        'language': row[13],
                        'domain': row[14]
                    }
                    items.append(item)
                return items
        except Exception as e:
            logger.error(f"Error retrieving scraped items: {e}")
            return []

    async def get_scraping_statistics(self) -> Dict[str, Any]:
        """Get scraping statistics"""
        try:
            with sqlite3.connect(self.db_path) as conn:
                cursor = conn.cursor()
                # Total items
                cursor.execute("SELECT COUNT(*) FROM scraped_items")
                total_items = cursor.fetchone()[0]
                # Items by status
                cursor.execute("""
                    SELECT processing_status, COUNT(*)
                    FROM scraped_items
                    GROUP BY processing_status
                """)
                status_counts = dict(cursor.fetchall())
                # Items by language
                cursor.execute("""
                    SELECT language, COUNT(*)
                    FROM scraped_items
                    GROUP BY language
                """)
                language_counts = dict(cursor.fetchall())
                # Average rating
                cursor.execute(
                    "SELECT AVG(rating_score) FROM scraped_items WHERE rating_score > 0")
                avg_rating = cursor.fetchone()[0] or 0
                # Active jobs
                active_jobs = len(
                    [j for j in self.active_jobs.values() if j.status == "processing"])
                return {
                    'total_items': total_items,
                    'status_distribution': status_counts,
                    'language_distribution': language_counts,
                    'average_rating': round(avg_rating, 2),
                    'active_jobs': active_jobs,
                    'total_jobs': len(self.active_jobs)
                }
        except Exception as e:
            logger.error(f"Error getting scraping statistics: {e}")
            return {}

    async def cleanup_old_jobs(self, days: int = 7):
        """Clean up old completed jobs"""
        try:
            cutoff_date = datetime.now(timezone.utc) - timedelta(days=days)
            # Remove old jobs from memory
            jobs_to_remove = []
            for job_id, job in self.active_jobs.items():
                if job.status in ["completed", "failed"] and job.created_at < cutoff_date:
                    jobs_to_remove.append(job_id)
            for job_id in jobs_to_remove:
                del self.active_jobs[job_id]
            logger.info(f"Cleaned up {len(jobs_to_remove)} old jobs")
        except Exception as e:
            logger.error(f"Error cleaning up old jobs: {e}")