Spaces:

Really-amin
/

Hoghoghi

Paused

App Files Files Community

Really-amin committed on Aug 4

Commit

a3783e0

verified ·

1 Parent(s): 48b1974

Update app/main.py

Browse files

Files changed (1) hide show

app/main.py +22 -147

app/main.py CHANGED Viewed

@@ -3,7 +3,7 @@ import tempfile
 import logging
 import traceback
 from pathlib import Path
-from typing import Optional, Dict, Any
 import asyncio
 from datetime import datetime
@@ -40,15 +40,6 @@ except ImportError as e:
 # Data models
 class OCRResponse(BaseModel):
-    """
-    OCR processing response model
-    Fields:
-    - success: Whether the OCR processing was successful
-    - text: Extracted text content
-    - method: Processing method used (PyMuPDF, TrOCR, Basic, etc.)
-    - metadata: Additional processing information (pages, file size, image dimensions, etc.)
-    """
     success: bool
     text: str
     method: str
@@ -71,43 +62,30 @@ class OCRService:
         try:
             logger.info("Loading TrOCR model...")
             model_name = "microsoft/trocr-base-printed"
             self.processor = TrOCRProcessor.from_pretrained(model_name)
             self.model = VisionEncoderDecoderModel.from_pretrained(model_name)
             self.model_loaded = True
             logger.info("✅ TrOCR model loaded successfully")
         except Exception as e:
             logger.error(f"❌ Failed to load TrOCR model: {e}")
             self.model_loaded = False
     async def extract_text_from_pdf(self, file_path: str) -> OCRResponse:
-        """Extract text from PDF using PyMuPDF"""
         if not PDF_AVAILABLE:
-            return OCRResponse(
-                success=False,
-                text="",
-                method="error",
-                metadata={"error": "PDF processing not available"}
-            )
         try:
             doc = fitz.open(file_path)
             pages_text = []
             total_chars = 0
-            total_pages = doc.page_count  # Get total pages before closing
-            # Process up to 10 pages to avoid timeout
             for page_num in range(min(total_pages, 10)):
                 page = doc[page_num]
                 text = page.get_text()
                 pages_text.append(text)
                 total_chars += len(text)
             doc.close()
             full_text = "\n\n--- Page Break ---\n\n".join(pages_text)
             return OCRResponse(
                 success=True,
                 text=full_text,
@@ -119,27 +97,17 @@ class OCRService:
                     "file_size_kb": os.path.getsize(file_path) / 1024
                 }
             )
         except Exception as e:
             logger.error(f"PDF processing error: {e}")
-            return OCRResponse(
-                success=False,
-                text="",
-                method="error",
-                metadata={"error": str(e)}
-            )
     async def extract_text_from_image(self, file_path: str) -> OCRResponse:
-        """Extract text from image using TrOCR"""
         try:
             image = Image.open(file_path)
             if self.model_loaded and self.processor and self.model:
-                # Use TrOCR
                 pixel_values = self.processor(images=image, return_tensors="pt").pixel_values
                 generated_ids = self.model.generate(pixel_values)
                 generated_text = self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
                 return OCRResponse(
                     success=True,
                     text=generated_text,
@@ -151,7 +119,6 @@ class OCRService:
                     }
                 )
             else:
-                # Fallback method
                 return OCRResponse(
                     success=True,
                     text=f"Image processed: {image.size} pixels, {image.mode} mode\nTrOCR model not loaded - text extraction limited",
@@ -162,20 +129,12 @@ class OCRService:
                         "note": "TrOCR model not available"
                     }
                 )
         except Exception as e:
             logger.error(f"Image processing error: {e}")
-            return OCRResponse(
-                success=False,
-                text="",
-                method="error",
-                metadata={"error": str(e)}
-            )
-# Initialize services
 ocr_service = OCRService()
-# Create FastAPI app
 app = FastAPI(
     title="Legal Dashboard API",
     description="Advanced Legal Document Processing System",
@@ -184,7 +143,6 @@ app = FastAPI(
     redoc_url="/api/redoc"
 )
-# Add CORS middleware
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
@@ -193,17 +151,15 @@ app.add_middleware(
     allow_headers=["*"],
 )
-# Create directories
-os.makedirs("static", exist_ok=True)
-os.makedirs("temp", exist_ok=True)
-# Mount static files
-app.mount("/static", StaticFiles(directory="static"), name="static")
-# Startup event to load ML models
 @app.on_event("startup")
 async def startup_event():
-    """Load ML models on application startup"""
     if ML_AVAILABLE:
         try:
             logger.info("🚀 Loading OCR models on startup...")
@@ -211,58 +167,16 @@ async def startup_event():
         except Exception as e:
             logger.error(f"❌ Failed to load models on startup: {e}")
-# Routes
 @app.get("/", response_class=HTMLResponse)
 async def root():
-    """Serve main dashboard"""
-    try:
-        html_file = Path("static/index.html")
-        if html_file.exists():
-            return FileResponse(html_file)
-        else:
-            # Return inline HTML if file doesn't exist
-            return HTMLResponse("""
-            <!DOCTYPE html>
-            <html>
-            <head>
-                <title>Legal Dashboard</title>
-                <meta charset="UTF-8">
-                <meta name="viewport" content="width=device-width, initial-scale=1.0">
-                <style>
-                    body { font-family: Arial, sans-serif; margin: 0; padding: 20px; background: #f5f5f5; }
-                    .container { max-width: 800px; margin: 0 auto; background: white; padding: 30px; border-radius: 10px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); }
-                    .header { text-align: center; color: #333; margin-bottom: 30px; }
-                    .status { padding: 15px; background: #e8f5e8; border-left: 4px solid #4CAF50; margin: 20px 0; }
-                    .nav { display: flex; gap: 10px; margin: 20px 0; flex-wrap: wrap; }
-                    .nav a { padding: 10px 20px; background: #4CAF50; color: white; text-decoration: none; border-radius: 5px; }
-                    .nav a:hover { background: #45a049; }
-                </style>
-            </head>
-            <body>
-                <div class="container">
-                    <div class="header">
-                        <h1>🏛️ Legal Dashboard</h1>
-                        <p>Advanced Legal Document Processing System</p>
-                    </div>
-                    <div class="status">
-                        <strong>✅ System Status:</strong> FastAPI backend is running successfully!
-                    </div>
-                    <div class="nav">
-                        <a href="/api/docs">📚 API Documentation</a>
-                        <a href="/health">❤️ Health Check</a>
-                        <a href="/system/status">📊 System Status</a>
-                    </div>
-                    <p><strong>Note:</strong> Please create static/index.html for the full frontend interface.</p>
-                </div>
-            </body>
-            </html>
-            """)
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Error serving main page: {str(e)}")
 @app.get("/health")
 async def health_check():
-    """Health check endpoint"""
     return {
         "status": "healthy",
         "message": "Legal Dashboard is running",
@@ -276,7 +190,6 @@ async def health_check():
 @app.get("/system/status", response_model=SystemStatus)
 async def get_system_status():
-    """Get detailed system status"""
     return SystemStatus(
         status="healthy",
         services={
@@ -298,67 +211,38 @@ async def get_system_status():
 @app.post("/api/ocr/extract-pdf", response_model=OCRResponse)
 async def extract_pdf_text(file: UploadFile = File(...)):
-    """Extract text from PDF file"""
     if not file.filename.lower().endswith('.pdf'):
         raise HTTPException(status_code=400, detail="File must be a PDF")
     temp_path = None
     try:
-        # Save uploaded file temporarily
         with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
             content = await file.read()
             temp_file.write(content)
             temp_path = temp_file.name
-        # Process PDF
-        result = await ocr_service.extract_text_from_pdf(temp_path)
-        return result
-    except Exception as e:
-        logger.error(f"PDF extraction error: {e}")
-        raise HTTPException(status_code=500, detail=f"PDF processing failed: {str(e)}")
     finally:
-        # Clean up temp file
         if temp_path and os.path.exists(temp_path):
-            try:
-                os.unlink(temp_path)
-            except Exception as e:
-                logger.warning(f"Failed to cleanup temp file {temp_path}: {e}")
 @app.post("/api/ocr/extract-image", response_model=OCRResponse)
 async def extract_image_text(file: UploadFile = File(...)):
-    """Extract text from image file"""
     allowed_extensions = ['.jpg', '.jpeg', '.png', '.bmp', '.tiff']
     if not any(file.filename.lower().endswith(ext) for ext in allowed_extensions):
         raise HTTPException(status_code=400, detail="File must be an image (JPG, PNG, BMP, TIFF)")
     temp_path = None
     try:
-        # Save uploaded file temporarily
         file_extension = Path(file.filename).suffix
         with tempfile.NamedTemporaryFile(delete=False, suffix=file_extension) as temp_file:
             content = await file.read()
             temp_file.write(content)
             temp_path = temp_file.name
-        # Process image
-        result = await ocr_service.extract_text_from_image(temp_path)
-        return result
-    except Exception as e:
-        logger.error(f"Image extraction error: {e}")
-        raise HTTPException(status_code=500, detail=f"Image processing failed: {str(e)}")
     finally:
-        # Clean up temp file
         if temp_path and os.path.exists(temp_path):
-            try:
-                os.unlink(temp_path)
-            except Exception as e:
-                logger.warning(f"Failed to cleanup temp file {temp_path}: {e}")
 @app.get("/api/test")
 async def test_endpoint():
-    """Test endpoint for debugging"""
     return {
         "message": "API is working!",
         "pdf_available": PDF_AVAILABLE,
@@ -367,7 +251,6 @@ async def test_endpoint():
         "timestamp": datetime.now().isoformat()
     }
-# Error handlers
 @app.exception_handler(Exception)
 async def global_exception_handler(request: Request, exc: Exception):
     logger.error(f"Global exception: {exc}")
@@ -383,12 +266,4 @@ async def global_exception_handler(request: Request, exc: Exception):
 if __name__ == "__main__":
     import uvicorn
-    # Run on port 7860 for Hugging Face Spaces
-    uvicorn.run(
-        "main:app",
-        host="0.0.0.0",
-        port=7860,
-        reload=False,
-        log_level="info"
-    )

 import logging
 import traceback
 from pathlib import Path
+from typing import Dict, Any
 import asyncio
 from datetime import datetime
 # Data models
 class OCRResponse(BaseModel):
     success: bool
     text: str
     method: str
         try:
             logger.info("Loading TrOCR model...")
             model_name = "microsoft/trocr-base-printed"
             self.processor = TrOCRProcessor.from_pretrained(model_name)
             self.model = VisionEncoderDecoderModel.from_pretrained(model_name)
             self.model_loaded = True
             logger.info("✅ TrOCR model loaded successfully")
         except Exception as e:
             logger.error(f"❌ Failed to load TrOCR model: {e}")
             self.model_loaded = False
     async def extract_text_from_pdf(self, file_path: str) -> OCRResponse:
         if not PDF_AVAILABLE:
+            return OCRResponse(success=False, text="", method="error",
+                               metadata={"error": "PDF processing not available"})
         try:
             doc = fitz.open(file_path)
             pages_text = []
             total_chars = 0
+            total_pages = doc.page_count
             for page_num in range(min(total_pages, 10)):
                 page = doc[page_num]
                 text = page.get_text()
                 pages_text.append(text)
                 total_chars += len(text)
             doc.close()
             full_text = "\n\n--- Page Break ---\n\n".join(pages_text)
             return OCRResponse(
                 success=True,
                 text=full_text,
                     "file_size_kb": os.path.getsize(file_path) / 1024
                 }
             )
         except Exception as e:
             logger.error(f"PDF processing error: {e}")
+            return OCRResponse(success=False, text="", method="error", metadata={"error": str(e)})
     async def extract_text_from_image(self, file_path: str) -> OCRResponse:
         try:
             image = Image.open(file_path)
             if self.model_loaded and self.processor and self.model:
                 pixel_values = self.processor(images=image, return_tensors="pt").pixel_values
                 generated_ids = self.model.generate(pixel_values)
                 generated_text = self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
                 return OCRResponse(
                     success=True,
                     text=generated_text,
                     }
                 )
             else:
                 return OCRResponse(
                     success=True,
                     text=f"Image processed: {image.size} pixels, {image.mode} mode\nTrOCR model not loaded - text extraction limited",
                         "note": "TrOCR model not available"
                     }
                 )
         except Exception as e:
             logger.error(f"Image processing error: {e}")
+            return OCRResponse(success=False, text="", method="error", metadata={"error": str(e)})
 ocr_service = OCRService()
 app = FastAPI(
     title="Legal Dashboard API",
     description="Advanced Legal Document Processing System",
     redoc_url="/api/redoc"
 )
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
     allow_headers=["*"],
 )
# Expose the "frontend" folder under /static when it is present; otherwise
# only log a warning so the API can still start without the UI assets.
frontend_dir = Path("frontend")
if frontend_dir.exists():
    app.mount("/static", StaticFiles(directory=frontend_dir), name="static")
else:
    logger.warning("⚠️ Frontend directory not found. UI may not load correctly.")
 @app.on_event("startup")
 async def startup_event():
     if ML_AVAILABLE:
         try:
             logger.info("🚀 Loading OCR models on startup...")
         except Exception as e:
             logger.error(f"❌ Failed to load models on startup: {e}")
 @app.get("/", response_class=HTMLResponse)
 async def root():
     """Return frontend/index.html, or a short HTML notice when that file is absent."""
     index_page = Path("frontend/index.html")
     if not index_page.exists():
         # No frontend deployed: answer with an inline hint instead of an error.
         return HTMLResponse("<h1>⚠️ Frontend not found</h1><p>Please ensure frontend/index.html exists.</p>")
     return FileResponse(index_page)
 @app.get("/health")
 async def health_check():
     return {
         "status": "healthy",
         "message": "Legal Dashboard is running",
 @app.get("/system/status", response_model=SystemStatus)
 async def get_system_status():
     return SystemStatus(
         status="healthy",
         services={
 @app.post("/api/ocr/extract-pdf", response_model=OCRResponse)
 async def extract_pdf_text(file: UploadFile = File(...)):
     """Extract text from an uploaded PDF file.

     The upload is written to a temporary ``.pdf`` file, handed to
     ``ocr_service.extract_text_from_pdf`` and the temporary file is removed
     afterwards, whether or not processing succeeded.

     Args:
         file: The uploaded file; its name must end in ``.pdf`` (case-insensitive).

     Returns:
         The result of ``ocr_service.extract_text_from_pdf``.

     Raises:
         HTTPException: 400 when the upload has no filename or the filename
             does not end in ``.pdf``.
     """
     # The upload's filename can be None; treat that as "not a PDF" (400)
     # instead of failing with AttributeError on .lower().
     if not (file.filename or "").lower().endswith('.pdf'):
         raise HTTPException(status_code=400, detail="File must be a PDF")

     temp_path = None
     try:
         with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
             # Record the path before reading/writing so the finally block can
             # still remove the file if either of those steps raises.
             temp_path = temp_file.name
             temp_file.write(await file.read())
         return await ocr_service.extract_text_from_pdf(temp_path)
     finally:
         if temp_path is not None:
             # Best-effort cleanup: a failed delete must not replace the
             # response (or the original exception) with a new error.
             try:
                 os.unlink(temp_path)
             except FileNotFoundError:
                 pass
             except OSError as e:
                 logger.warning(f"Failed to cleanup temp file {temp_path}: {e}")
 @app.post("/api/ocr/extract-image", response_model=OCRResponse)
 async def extract_image_text(file: UploadFile = File(...)):
     """Extract text from an uploaded image file.

     The upload is written to a temporary file that keeps the original
     extension, handed to ``ocr_service.extract_text_from_image`` and the
     temporary file is removed afterwards, whether or not processing succeeded.

     Args:
         file: The uploaded file; its name must end in one of
             ``.jpg``, ``.jpeg``, ``.png``, ``.bmp`` or ``.tiff`` (case-insensitive).

     Returns:
         The result of ``ocr_service.extract_text_from_image``.

     Raises:
         HTTPException: 400 when the upload has no filename or the filename
             does not carry one of the allowed extensions.
     """
     # The upload's filename can be None; normalise to "" so the extension
     # check answers 400 instead of failing with AttributeError.
     filename = file.filename or ""
     allowed_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.tiff')
     # str.endswith accepts a tuple, so one call covers every allowed suffix.
     if not filename.lower().endswith(allowed_extensions):
         raise HTTPException(status_code=400, detail="File must be an image (JPG, PNG, BMP, TIFF)")

     temp_path = None
     try:
         file_extension = Path(filename).suffix
         with tempfile.NamedTemporaryFile(delete=False, suffix=file_extension) as temp_file:
             # Record the path before reading/writing so the finally block can
             # still remove the file if either of those steps raises.
             temp_path = temp_file.name
             temp_file.write(await file.read())
         return await ocr_service.extract_text_from_image(temp_path)
     finally:
         if temp_path is not None:
             # Best-effort cleanup: a failed delete must not replace the
             # response (or the original exception) with a new error.
             try:
                 os.unlink(temp_path)
             except FileNotFoundError:
                 pass
             except OSError as e:
                 logger.warning(f"Failed to cleanup temp file {temp_path}: {e}")
 @app.get("/api/test")
 async def test_endpoint():
     return {
         "message": "API is working!",
         "pdf_available": PDF_AVAILABLE,
         "timestamp": datetime.now().isoformat()
     }
 @app.exception_handler(Exception)
 async def global_exception_handler(request: Request, exc: Exception):
     logger.error(f"Global exception: {exc}")
 if __name__ == "__main__":
     # Script entry point: serve the app with uvicorn on all interfaces, port 7860.
     import uvicorn

     uvicorn.run(
         "main:app",
         host="0.0.0.0",
         port=7860,
         reload=False,
         log_level="info",
     )