"""Legal Dashboard API.

FastAPI application that serves a static frontend and exposes OCR endpoints:
embedded-text extraction from PDFs via PyMuPDF and image OCR via the
``microsoft/trocr-base-printed`` TrOCR model. Both optional dependency groups
are imported defensively so the API still starts when they are missing.
"""
import os
import tempfile
import logging
import traceback
from pathlib import Path
from typing import Dict, Any
from datetime import datetime

from fastapi import FastAPI, File, UploadFile, HTTPException, Request
from fastapi.concurrency import run_in_threadpool
from fastapi.staticfiles import StaticFiles
from fastapi.responses import HTMLResponse, FileResponse, JSONResponse
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel

# Configure logging based on the LOG_LEVEL environment variable.
# Unknown level names fall back to INFO.
log_level = os.getenv("LOG_LEVEL", "INFO").upper()
logging.basicConfig(level=getattr(logging, log_level, logging.INFO))
logger = logging.getLogger(__name__)

# Load the PDF and image processing libraries.
try:
    import fitz  # PyMuPDF
    from PIL import Image
    import numpy as np
    PDF_AVAILABLE = True
    logger.info("✅ PDF processing libraries loaded")
except ImportError as e:
    PDF_AVAILABLE = False
    logger.warning(f"⚠️ PDF libraries not available: {e}")

# Load the ML libraries.
try:
    from transformers import TrOCRProcessor, VisionEncoderDecoderModel
    import torch
    ML_AVAILABLE = True
    logger.info("✅ ML libraries loaded")
except ImportError as e:
    ML_AVAILABLE = False
    logger.warning(f"⚠️ ML libraries not available: {e}")


# OCR response model.
class OCRResponse(BaseModel):
    """Result of a text-extraction request."""

    success: bool  # False when extraction failed; details are in metadata["error"]
    text: str  # extracted text (empty on failure)
    method: str  # "PyMuPDF", "TrOCR", "Basic" or "error"
    metadata: Dict[str, Any]


# System status model.
class SystemStatus(BaseModel):
    """Payload returned by ``/system/status``."""

    status: str
    services: Dict[str, Any]
    timestamp: str


# OCR service.
class OCRService:
    """Extracts text from PDFs (PyMuPDF) and images (TrOCR)."""

    MODEL_NAME = "microsoft/trocr-base-printed"

    def __init__(self):
        self.model = None
        self.processor = None
        self.model_loaded = False

    async def _load_model_async(self):
        """Load the TrOCR processor and model; failures are logged, not raised."""
        try:
            logger.info("Loading TrOCR model...")
            self.processor = TrOCRProcessor.from_pretrained(self.MODEL_NAME)
            self.model = VisionEncoderDecoderModel.from_pretrained(self.MODEL_NAME)
            self.model_loaded = True
            logger.info("✅ TrOCR model loaded successfully")
        except Exception as e:
            logger.error(f"❌ Failed to load TrOCR model: {e}")
            self.model_loaded = False

    async def extract_text_from_pdf(self, file_path: str, max_pages: int = 10) -> OCRResponse:
        """Extract the embedded text of the first ``max_pages`` pages of a PDF.

        Args:
            file_path: Path of the PDF on disk.
            max_pages: Upper bound on the number of pages read (default 10).

        Returns:
            An ``OCRResponse``. Errors are reported through ``success=False``
            and ``metadata["error"]`` rather than raised.
        """
        if not PDF_AVAILABLE:
            return OCRResponse(success=False, text="", method="error", metadata={"error": "PDF processing not available"})
        # Parsing is CPU-bound; run it in a worker thread so the event loop
        # keeps serving other requests.
        return await run_in_threadpool(self._extract_pdf_sync, file_path, max_pages)

    def _extract_pdf_sync(self, file_path: str, max_pages: int) -> OCRResponse:
        """Blocking implementation of ``extract_text_from_pdf``."""
        try:
            doc = fitz.open(file_path)
            try:
                total_pages = doc.page_count
                pages_text = [
                    doc[page_num].get_text()
                    for page_num in range(min(total_pages, max_pages))
                ]
            finally:
                # Always release the document handle, even if a page fails.
                doc.close()

            return OCRResponse(
                success=True,
                text="\n\n--- Page Break ---\n\n".join(pages_text),
                method="PyMuPDF",
                metadata={
                    "pages_processed": len(pages_text),
                    "total_pages": total_pages,
                    "total_characters": sum(len(text) for text in pages_text),
                    "file_size_kb": os.path.getsize(file_path) / 1024
                }
            )
        except Exception as e:
            logger.error(f"PDF processing error: {e}")
            return OCRResponse(success=False, text="", method="error", metadata={"error": str(e)})

    async def extract_text_from_image(self, file_path: str) -> OCRResponse:
        """Run TrOCR on an image, or report basic image info if no model is loaded.

        Args:
            file_path: Path of the image on disk.

        Returns:
            An ``OCRResponse``. Errors are reported through ``success=False``
            and ``metadata["error"]`` rather than raised.
        """
        if not PDF_AVAILABLE:
            # Pillow is imported together with the PDF libraries; without it
            # ``Image`` is undefined, so fail with a clear message.
            return OCRResponse(success=False, text="", method="error", metadata={"error": "Image processing not available"})
        # Model inference is CPU/GPU-bound; keep it off the event loop.
        return await run_in_threadpool(self._extract_image_sync, file_path)

    def _extract_image_sync(self, file_path: str) -> OCRResponse:
        """Blocking implementation of ``extract_text_from_image``."""
        try:
            # The context manager closes the underlying file handle.
            with Image.open(file_path) as image:
                image_size = image.size
                image_mode = image.mode

                model_ready = (
                    self.model_loaded
                    and self.processor is not None
                    and self.model is not None
                )
                if not model_ready:
                    return OCRResponse(
                        success=True,
                        text=f"Image processed: {image_size} pixels, {image_mode} mode\nTrOCR model not loaded - text extraction limited",
                        method="Basic",
                        metadata={
                            "image_size": image_size,
                            "image_mode": image_mode,
                            "note": "TrOCR model not available"
                        }
                    )

                # TrOCR expects 3-channel input; grayscale, palette and RGBA
                # images must be converted first.
                rgb_image = image.convert("RGB")
                pixel_values = self.processor(images=rgb_image, return_tensors="pt").pixel_values
                generated_ids = self.model.generate(pixel_values)
                generated_text = self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

                return OCRResponse(
                    success=True,
                    text=generated_text,
                    method="TrOCR",
                    metadata={
                        "image_size": image_size,
                        "image_mode": image_mode,
                        "model": self.MODEL_NAME
                    }
                )
        except Exception as e:
            logger.error(f"Image processing error: {e}")
            return OCRResponse(success=False, text="", method="error", metadata={"error": str(e)})


ocr_service = OCRService()

app = FastAPI(
    title="Legal Dashboard API",
    description="Advanced Legal Document Processing System",
    version="2.0.0",
    docs_url="/api/docs",
    redoc_url="/api/redoc"
)

# NOTE(review): wildcard origins combined with allow_credentials=True lets any
# site make credentialed requests; restrict allow_origins before production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Configure the static frontend directory.
BASE_DIR = Path(__file__).parent
frontend_dir = BASE_DIR / "frontend"

if frontend_dir.exists():
    logger.info(f"✅ Frontend directory found: {frontend_dir}")
    app.mount("/static", StaticFiles(directory=frontend_dir), name="static")
else:
    logger.warning("⚠️ Frontend directory not found. UI will not load correctly.")


@app.on_event("startup")
async def startup_event():
    """Load the OCR model at startup when the ML libraries are installed."""
    if ML_AVAILABLE:
        try:
            logger.info("🚀 Loading OCR models on startup...")
            await ocr_service._load_model_async()
        except Exception as e:
            logger.error(f"❌ Failed to load models on startup: {e}")


@app.get("/", response_class=HTMLResponse)
async def root():
    """Serve ``frontend/index.html``, or a short hint when it is missing."""
    html_file = frontend_dir / "index.html"
    if html_file.exists():
        return FileResponse(html_file)
    return HTMLResponse("""
Please ensure 'frontend/index.html' exists in the project root.
""")


@app.get("/health")
async def health_check():
    """Report liveness plus the availability of the optional services."""
    return {
        "status": "healthy",
        "message": "Legal Dashboard is running",
        "timestamp": datetime.now().isoformat(),
        "services": {
            "pdf_processing": PDF_AVAILABLE,
            "ml_models": ML_AVAILABLE,
            "ocr_model_loaded": ocr_service.model_loaded
        }
    }


@app.get("/system/status", response_model=SystemStatus)
async def get_system_status():
    """Return a per-service availability report with display labels."""
    if ocr_service.model_loaded:
        ocr_model_status = "✅ Loaded"
    elif ML_AVAILABLE:
        ocr_model_status = "⏳ Loading..."
    else:
        ocr_model_status = "❌ Not Available"

    return SystemStatus(
        status="healthy",
        services={
            "pdf_processing": {
                "available": PDF_AVAILABLE,
                "status": "✅ Available" if PDF_AVAILABLE else "❌ Not Available"
            },
            "ml_models": {
                "available": ML_AVAILABLE,
                "status": "✅ Available" if ML_AVAILABLE else "❌ Not Available"
            },
            "ocr_model": {
                "loaded": ocr_service.model_loaded,
                "status": ocr_model_status
            }
        },
        timestamp=datetime.now().isoformat()
    )


async def _process_upload(file: UploadFile, suffix: str, extractor) -> OCRResponse:
    """Copy an upload to a temp file, run ``extractor`` on it, then delete it."""
    temp_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
            # Record the path before writing so the cleanup below still runs
            # if reading the upload or writing to disk fails.
            temp_path = temp_file.name
            temp_file.write(await file.read())
        return await extractor(temp_path)
    finally:
        if temp_path and os.path.exists(temp_path):
            os.unlink(temp_path)


@app.post("/api/ocr/extract-pdf", response_model=OCRResponse)
async def extract_pdf_text(file: UploadFile = File(...)):
    """Extract text from an uploaded PDF.

    Raises:
        HTTPException: 400 when the upload has no filename or is not ``.pdf``.
    """
    # ``filename`` may be None for some multipart clients.
    if not file.filename or not file.filename.lower().endswith('.pdf'):
        raise HTTPException(status_code=400, detail="File must be a PDF")

    return await _process_upload(file, '.pdf', ocr_service.extract_text_from_pdf)


@app.post("/api/ocr/extract-image", response_model=OCRResponse)
async def extract_image_text(file: UploadFile = File(...)):
    """Extract text from an uploaded image.

    Raises:
        HTTPException: 400 when the upload has no filename or an unsupported
            extension.
    """
    allowed_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.tiff')
    if not file.filename or not file.filename.lower().endswith(allowed_extensions):
        raise HTTPException(status_code=400, detail="File must be an image (JPG, PNG, BMP, TIFF)")

    file_extension = Path(file.filename).suffix
    return await _process_upload(file, file_extension, ocr_service.extract_text_from_image)


@app.get("/api/test")
async def test_endpoint():
    """Smoke-test endpoint reporting basic availability flags."""
    return {
        "message": "API is working!",
        "pdf_available": PDF_AVAILABLE,
        "ml_available": ML_AVAILABLE,
        "ocr_model_loaded": ocr_service.model_loaded,
        "timestamp": datetime.now().isoformat()
    }


@app.exception_handler(Exception)
async def global_exception_handler(request: Request, exc: Exception):
    """Log any unhandled exception and return a JSON 500 response."""
    logger.error(f"Global exception: {exc}")
    # Format from the exception object itself instead of relying on the
    # interpreter's "currently handled exception" state.
    logger.error("".join(traceback.format_exception(type(exc), exc, exc.__traceback__)))
    # NOTE(review): "message" exposes internal error text to clients.
    return JSONResponse(
        status_code=500,
        content={
            "error": "Internal server error",
            "message": str(exc),
            "path": str(request.url)
        }
    )


if __name__ == "__main__":
    import uvicorn
    uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=False, log_level="info")