File size: 6,197 Bytes
922c3ba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
"""

Document Models for Legal Dashboard OCR

=====================================



Pydantic models and dataclasses for legal document data structures.

"""

import uuid
from dataclasses import dataclass, field, fields
from datetime import datetime
from typing import Any, Dict, List, Optional

from pydantic import BaseModel, Field


@dataclass
class LegalDocument:
    """Legal document record with extraction metadata and AI scoring fields.

    Every field has a default, so an instance can be created empty and filled
    in step by step. ``__post_init__`` supplies ``id`` and ``extracted_at``
    when they are left at their defaults (``None`` and ``""`` respectively).
    """

    # --- identity and bibliographic data ---
    id: Optional[str] = None  # None is replaced by a random UUID4 string
    title: str = ""
    document_number: str = ""
    publication_date: str = ""
    source: str = ""
    full_text: str = ""
    url: str = ""
    extracted_at: str = ""  # "" is replaced by the creation timestamp

    # --- scoring and review state ---
    source_credibility: float = 0.0
    document_quality: float = 0.0
    final_score: float = 0.0
    category: str = ""
    status: str = "pending"
    ai_confidence: float = 0.0
    user_feedback: Optional[str] = None
    # default_factory gives each instance its own list (no shared mutable default)
    keywords: List[str] = field(default_factory=list)
    references: List[str] = field(default_factory=list)
    recency_score: float = 0.0

    # --- OCR / file metadata ---
    ocr_confidence: float = 0.0
    language: str = "fa"  # Persian by default
    file_path: Optional[str] = None
    file_size: Optional[int] = None
    processing_time: Optional[float] = None

    def __post_init__(self):
        """Fill in ``id`` and ``extracted_at`` when the caller did not set them."""
        if self.id is None:
            self.id = str(uuid.uuid4())
        if self.extracted_at == "":
            # datetime.now() is called without a tz, so this is a naive
            # local-time ISO 8601 string.
            self.extracted_at = datetime.now().isoformat()

    def to_dict(self) -> Dict[str, Any]:
        """Return the document as a plain ``{field name: value}`` dictionary.

        The mapping is derived from the dataclass field definitions, so keys
        appear in declaration order and a field added later (or by a
        subclass) is included automatically instead of having to be copied
        into a hand-written list.

        Values are not copied: list-valued fields such as ``keywords`` are
        the very objects held by the instance.
        """
        return {spec.name: getattr(self, spec.name) for spec in fields(self)}


# Pydantic Models for API
class DocumentCreate(BaseModel):
    """Model for creating a new document"""

    # ``title`` is the only required field (its default is ``...``); every
    # other field falls back to the default given in its Field declaration.
    title: str = Field(
        default=...,
        description="Document title",
    )
    document_number: str = Field(
        default="",
        description="Document number",
    )
    publication_date: str = Field(
        default="",
        description="Publication date",
    )
    source: str = Field(
        default="",
        description="Document source",
    )
    full_text: str = Field(
        default="",
        description="Extracted text content",
    )
    url: str = Field(
        default="",
        description="Document URL",
    )
    category: str = Field(
        default="",
        description="Document category",
    )
    language: str = Field(
        default="fa",
        description="Document language",
    )


class DocumentUpdate(BaseModel):
    """Model for updating a document"""
    # Every field is optional and defaults to None, so a payload may carry
    # any subset of them.
    # NOTE(review): presumably None means "leave this attribute unchanged";
    # confirm against the code that applies the update.
    title: Optional[str] = None
    document_number: Optional[str] = None
    publication_date: Optional[str] = None
    source: Optional[str] = None
    full_text: Optional[str] = None
    url: Optional[str] = None
    category: Optional[str] = None
    status: Optional[str] = None
    user_feedback: Optional[str] = None
    # List-valued fields also default to None rather than to an empty list.
    keywords: Optional[List[str]] = None
    references: Optional[List[str]] = None


class DocumentResponse(BaseModel):
    """Model for document API responses"""
    # The field names and their order match the keys produced by
    # LegalDocument.to_dict(). No field declares an explicit default.
    id: str
    title: str
    document_number: str
    publication_date: str
    source: str
    full_text: str
    url: str
    extracted_at: str
    source_credibility: float
    document_quality: float
    final_score: float
    category: str
    status: str
    ai_confidence: float
    # NOTE(review): the Optional[...] fields below have no "= None"; whether
    # they may be omitted depends on the Pydantic major version in use
    # (required-but-nullable under v2) -- confirm.
    user_feedback: Optional[str]
    keywords: List[str]
    references: List[str]
    recency_score: float
    ocr_confidence: float
    language: str
    file_path: Optional[str]
    file_size: Optional[int]
    processing_time: Optional[float]


class OCRRequest(BaseModel):
    """Model for OCR processing requests"""

    # ``file_path`` is required; ``language`` defaults to "fa" and
    # ``model_name`` to None.
    file_path: str = Field(
        default=...,
        description="Path to the PDF file",
    )
    language: str = Field(
        default="fa",
        description="Document language",
    )
    # NOTE(review): under Pydantic v2 a field name starting with "model_"
    # falls in the protected namespace and triggers a warning -- confirm
    # which major version this project targets.
    model_name: Optional[str] = Field(
        default=None,
        description="OCR model to use",
    )


class OCRResponse(BaseModel):
    """Model for OCR processing responses"""
    # All fields except ``error_message`` have no default and must be given.
    success: bool
    extracted_text: str
    # NOTE(review): the scale of ``confidence`` and the unit of
    # ``processing_time`` are not defined in this module -- confirm with the
    # OCR pipeline that fills them in.
    confidence: float
    processing_time: float
    language_detected: str
    page_count: int
    # Defaults to None; presumably only set when ``success`` is False.
    error_message: Optional[str] = None


class DashboardSummary(BaseModel):
    """Model for dashboard summary data"""
    # All fields are required (none declares a default).
    total_documents: int
    processed_today: int
    average_score: float
    # Lists of free-form dicts: the key set of each entry is not fixed by
    # this model (values are typed as Any).
    top_categories: List[Dict[str, Any]]
    recent_activity: List[Dict[str, Any]]
    # Maps a name to a boolean flag; presumably component name -> healthy,
    # to be confirmed against the code that builds the summary.
    system_status: Dict[str, bool]


class AIFeedback(BaseModel):
    """Model for AI training feedback"""

    # The first three fields are required; ``feedback_text`` defaults to "".
    document_id: str = Field(
        default=...,
        description="Document ID",
    )
    feedback_type: str = Field(
        default=...,
        description="Type of feedback",
    )
    # NOTE(review): no range constraint is declared for the score here.
    feedback_score: float = Field(
        default=...,
        description="Feedback score",
    )
    feedback_text: str = Field(
        default="",
        description="Feedback text",
    )


class SearchFilters(BaseModel):
    """Model for document search filters"""

    # Filter criteria: each one defaults to None.
    category: Optional[str] = None
    status: Optional[str] = None
    min_score: Optional[float] = None
    max_score: Optional[float] = None
    source: Optional[str] = None
    date_from: Optional[str] = None
    date_to: Optional[str] = None
    language: Optional[str] = None

    # Pagination window.
    # NOTE(review): neither value carries a range constraint, so negative
    # numbers pass validation in this model -- check that callers guard it.
    limit: int = Field(
        default=50,
        description="Number of results to return",
    )
    offset: int = Field(
        default=0,
        description="Number of results to skip",
    )


class PaginatedResponse(BaseModel):
    """Model for paginated API responses"""
    # All fields are required (none declares a default).
    # Documents on the returned page, each validated as a DocumentResponse.
    items: List[DocumentResponse]
    # NOTE(review): presumably ``total`` is the overall match count, ``page``
    # the current page, ``size`` the page size and ``pages`` the page count;
    # whether ``page`` is 0- or 1-based is not established in this module.
    total: int
    page: int
    size: int
    pages: int