"""
Document Models for Legal Dashboard OCR
=======================================

Pydantic models and dataclasses for legal document data structures.
"""
import uuid
from dataclasses import dataclass, field, fields
from datetime import datetime
from typing import Any, Dict, List, Optional

from pydantic import BaseModel, Field
@dataclass
class LegalDocument:
    """Legal document record with AI scoring metadata.

    Every field has a default, so an instance can be built from any subset
    of keyword arguments. After construction, ``__post_init__`` fills in
    ``id`` (a random UUID4 string) and ``extracted_at`` (the current time in
    ISO 8601 format) when the caller left them unset.
    """

    # Identity and bibliographic data.
    id: Optional[str] = None
    title: str = ""
    document_number: str = ""
    publication_date: str = ""
    source: str = ""
    full_text: str = ""
    url: str = ""
    extracted_at: str = ""

    # Scoring and review state.
    source_credibility: float = 0.0
    document_quality: float = 0.0
    final_score: float = 0.0
    category: str = ""
    status: str = "pending"
    ai_confidence: float = 0.0
    user_feedback: Optional[str] = None

    # Mutable defaults must go through default_factory so that instances
    # never share one list object.
    keywords: List[str] = field(default_factory=list)
    references: List[str] = field(default_factory=list)

    recency_score: float = 0.0
    ocr_confidence: float = 0.0
    language: str = "fa"  # Persian by default

    # File and processing metadata; None when not known.
    file_path: Optional[str] = None
    file_size: Optional[int] = None
    processing_time: Optional[float] = None

    def __post_init__(self) -> None:
        """Populate ``id`` and ``extracted_at`` when they were not supplied."""
        if self.id is None:
            self.id = str(uuid.uuid4())
        if not self.extracted_at:
            # datetime.now() is naive (no tzinfo); the stored string carries
            # no UTC offset.
            self.extracted_at = datetime.now().isoformat()

    def to_dict(self) -> Dict[str, Any]:
        """Return the document as a plain dict keyed by field name.

        Keys follow field declaration order. Values are the attribute
        objects themselves (lists are not copied).
        """
        # Deriving the mapping from the dataclass fields keeps it in sync
        # with the field list above.
        return {f.name: getattr(self, f.name) for f in fields(self)}
# Pydantic models for the API
class DocumentCreate(BaseModel):
    """Request payload for creating a document.

    ``title`` is the only required field. Every other text field defaults
    to an empty string, except ``language`` which defaults to ``"fa"``.
    """

    title: str = Field(..., description="Document title")
    document_number: str = Field(default="", description="Document number")
    publication_date: str = Field(default="", description="Publication date")
    source: str = Field(default="", description="Document source")
    full_text: str = Field(default="", description="Extracted text content")
    url: str = Field(default="", description="Document URL")
    category: str = Field(default="", description="Document category")
    language: str = Field(default="fa", description="Document language")
class DocumentUpdate(BaseModel):
    """Request payload for updating a document.

    Every field is optional and defaults to ``None``.
    NOTE(review): presumably ``None`` means "leave unchanged" — confirm
    against the update handler.
    """

    title: Optional[str] = Field(default=None)
    document_number: Optional[str] = Field(default=None)
    publication_date: Optional[str] = Field(default=None)
    source: Optional[str] = Field(default=None)
    full_text: Optional[str] = Field(default=None)
    url: Optional[str] = Field(default=None)
    category: Optional[str] = Field(default=None)
    status: Optional[str] = Field(default=None)
    user_feedback: Optional[str] = Field(default=None)
    keywords: Optional[List[str]] = Field(default=None)
    references: Optional[List[str]] = Field(default=None)
class DocumentResponse(BaseModel):
    """Response payload describing a single document.

    The field names match the keys produced by ``LegalDocument.to_dict()``.
    The nullable fields carry an explicit ``None`` default so they are
    optional under both Pydantic v1 and v2. Without the default, v2 treats
    an ``Optional[...]`` annotation as required-but-nullable.
    """

    id: str
    title: str
    document_number: str
    publication_date: str
    source: str
    full_text: str
    url: str
    extracted_at: str
    source_credibility: float
    document_quality: float
    final_score: float
    category: str
    status: str
    ai_confidence: float
    user_feedback: Optional[str] = None
    keywords: List[str]
    references: List[str]
    recency_score: float
    ocr_confidence: float
    language: str
    # These default to None on LegalDocument as well.
    file_path: Optional[str] = None
    file_size: Optional[int] = None
    processing_time: Optional[float] = None
class OCRRequest(BaseModel):
    """Request payload for an OCR processing job.

    ``file_path`` is required. ``language`` defaults to ``"fa"`` and
    ``model_name`` defaults to ``None``.
    """

    file_path: str = Field(..., description="Path to the PDF file")
    language: str = Field(default="fa", description="Document language")
    # NOTE(review): Pydantic v2 reserves the "model_" prefix and warns about
    # field names that use it — confirm which Pydantic version is installed.
    model_name: Optional[str] = Field(
        default=None,
        description="OCR model to use",
    )
class OCRResponse(BaseModel):
    """Response payload returned after OCR processing.

    All fields are required except ``error_message``, which defaults to
    ``None``.
    """

    success: bool
    extracted_text: str
    confidence: float
    processing_time: float  # unit is not stated in this file
    language_detected: str
    page_count: int
    error_message: Optional[str] = Field(default=None)
class DashboardSummary(BaseModel):
    """Summary payload for the dashboard; every field is required."""

    total_documents: int
    processed_today: int
    average_score: float
    # The dict schemas for the two lists below are not defined in this file.
    top_categories: List[Dict[str, Any]]
    recent_activity: List[Dict[str, Any]]
    # Maps a string key to a boolean flag.
    system_status: Dict[str, bool]
class AIFeedback(BaseModel):
    """Feedback payload used for AI training.

    ``document_id``, ``feedback_type`` and ``feedback_score`` are required;
    ``feedback_text`` defaults to an empty string.
    """

    document_id: str = Field(..., description="Document ID")
    feedback_type: str = Field(..., description="Type of feedback")
    feedback_score: float = Field(..., description="Feedback score")
    feedback_text: str = Field(default="", description="Feedback text")
class SearchFilters(BaseModel):
    """Filter and paging parameters for a document search.

    Every filter is optional and defaults to ``None``. ``limit`` defaults
    to 50 and ``offset`` to 0.
    """

    category: Optional[str] = Field(default=None)
    status: Optional[str] = Field(default=None)
    min_score: Optional[float] = Field(default=None)
    max_score: Optional[float] = Field(default=None)
    source: Optional[str] = Field(default=None)
    date_from: Optional[str] = Field(default=None)
    date_to: Optional[str] = Field(default=None)
    language: Optional[str] = Field(default=None)
    limit: int = Field(default=50, description="Number of results to return")
    offset: int = Field(default=0, description="Number of results to skip")
class PaginatedResponse(BaseModel):
    """Paginated list of ``DocumentResponse`` items; every field is required."""

    items: List[DocumentResponse]
    total: int
    # NOTE(review): presumably page index, page size and page count —
    # confirm the exact meaning and base (0 or 1) against the endpoint.
    page: int
    size: int
    pages: int