Spaces:
Paused
Paused
import os
import tempfile
import logging
from pathlib import Path
from typing import Optional, Tuple

# Gradio is an optional dependency: the module must stay importable without
# it so callers can inspect GRADIO_AVAILABLE and degrade gracefully.
try:
    import gradio as gr
    GRADIO_AVAILABLE = True
except ImportError:
    # Keep the name bound so an accidental reference fails predictably
    # instead of raising NameError.
    gr = None
    GRADIO_AVAILABLE = False
    logging.warning("Gradio not available")

# Import our services
# Both project services are imported together; if either import fails the
# whole OCR/database backend is treated as unavailable.
try:
    from app.services.ocr_service import ocr_service
    from app.services.database_service import DatabaseService
    OCR_AVAILABLE = True
except ImportError:
    # Bind explicit fallbacks: code further down references these names
    # unconditionally and previously relied on a swallowed NameError.
    ocr_service = None
    DatabaseService = None
    OCR_AVAILABLE = False
    logging.warning("OCR service not available")

logger = logging.getLogger(__name__)
class LegalDashboardGradio:
    """
    Gradio interface for Legal Dashboard

    Thin adapter between the UI callbacks and the OCR / database services.
    Every public method returns plain strings for display and reports
    failures as messages instead of raising.
    """

    # Extensions routed to the image OCR path; ".pdf" is handled separately.
    _IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.bmp', '.tiff')
    # Upper bound on the number of named entities listed in the details panel.
    _MAX_ENTITIES_SHOWN = 10

    def __init__(self):
        """
        Bind the OCR service (when importable) and try to create the database
        service; either may end up as ``None``.
        """
        self.ocr_service = ocr_service if OCR_AVAILABLE else None
        self.db_service = None
        # Initialize database if available. Any failure (including the class
        # not being importable) is logged and leaves db_service as None.
        try:
            self.db_service = DatabaseService()
        except Exception as e:
            logger.warning("Database service not available: %s", e)

    async def process_document(self, file) -> Tuple[str, str, str]:
        """
        Process uploaded document and extract text

        ``file`` may be an object exposing the on-disk path as ``.name`` or a
        plain path (``str`` / ``os.PathLike``).

        Returns a ``(status, extracted_text, metadata)`` tuple; on any failure
        the last two items are empty strings and ``status`` carries the reason.
        """
        if not file:
            return "β No file uploaded", "", ""
        if not self.ocr_service:
            return "β OCR service not available", "", ""
        try:
            # Get file path. NOTE(review): which form Gradio passes depends on
            # the File component's ``type`` setting - confirm against the
            # pinned Gradio version. Both forms are accepted here.
            if isinstance(file, (str, os.PathLike)):
                file_path = os.fspath(file)
            else:
                file_path = file.name
            file_extension = Path(file_path).suffix.lower()

            # Process based on file type
            if file_extension == '.pdf':
                result = await self.ocr_service.extract_text_from_pdf(file_path)
            elif file_extension in self._IMAGE_EXTENSIONS:
                result = await self.ocr_service.extract_text_from_image(file_path)
            else:
                return f"β Unsupported file type: {file_extension}", "", ""

            if not result["success"]:
                error_msg = result.get("metadata", {}).get("error", "Unknown error")
                return f"β Processing failed: {error_msg}", "", ""

            text = result["text"]
            # Create status message
            status = f"β Successfully processed using {result['method']}"
            # Create metadata info
            metadata = (
                "\n**Processing Details:**\n"
                f"- Method: {result['method']}\n"
                f"- Character Count: {len(text)}\n"
                f"- Pages: {len(result.get('pages', []))}\n"
            )
            metadata += await self._describe_entities(text)
            return status, text, metadata
        except Exception as e:
            logger.error("Document processing error: %s", e)
            return f"β Error: {str(e)}", "", ""

    async def _describe_entities(self, text: str) -> str:
        """
        Return the "Named Entities Found" section for ``text``, or ``""``.

        NLP enrichment is best effort: if it fails, the text that was already
        extracted is still returned to the user without the entity list.
        """
        try:
            processed = await self.ocr_service.process_text(text)
            entities = processed.get('entities') if processed else None
            if not entities:
                return ""
            listed = [
                f"- {ent['text']} ({ent['label']})\n"
                for ent in entities[:self._MAX_ENTITIES_SHOWN]
            ]
        except Exception as e:
            logger.warning("NLP post-processing failed: %s", e)
            return ""
        return "\n**Named Entities Found:**\n" + "".join(listed)

    def search_documents(self, query: str) -> str:
        """
        Search in processed documents

        Currently returns a placeholder message; an empty, blank or missing
        query and a missing database service are each reported as a message.
        """
        # ``query`` can be None when the textbox was never touched.
        if not query or not query.strip():
            return "Please enter a search query"
        if not self.db_service:
            return "Database service not available"
        try:
            # This would search in the database
            # For now, return a placeholder
            return f"Search results for '{query}' would appear here.\n\nDatabase integration coming soon..."
        except Exception as e:
            return f"Search error: {str(e)}"

    def get_system_status(self) -> str:
        """
        Get system status information

        Returns a newline-joined report covering the OCR service, the database
        service and Gradio availability, or an error message if building the
        report fails.
        """
        try:
            status = []
            # OCR Service Status
            if self.ocr_service:
                ocr_status = self.ocr_service.get_service_status()
                models = ocr_status['models_loaded']
                status.extend([
                    "π **OCR Service:**",
                    f" - Status: {'β Ready' if ocr_status['fallback_ready'] else 'β Not Ready'}",
                    f" - Transformers: {'β Available' if ocr_status['transformers_ready'] else 'β Not Available'}",
                    f" - spaCy: {'β Available' if ocr_status['spacy_ready'] else 'β Not Available'}",
                    f" - Models: {', '.join(models) if models else 'None'}",
                ])
            else:
                status.append("π **OCR Service:** β Not Available")
            # Database Service Status
            if self.db_service:
                status.append("\nπΎ **Database Service:** β Available")
            else:
                status.append("\nπΎ **Database Service:** β Not Available")
            # System Info
            status.append("\nπ₯οΈ **System Info:**")
            status.append(" - Python: Available")
            status.append(f" - Gradio: {'β Available' if GRADIO_AVAILABLE else 'β Not Available'}")
            return "\n".join(status)
        except Exception as e:
            return f"Error getting system status: {str(e)}"
# Custom CSS injected into the Blocks app (header banner and status box).
_DASHBOARD_CSS = """
.gradio-container {
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
}
.main-header {
text-align: center;
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
padding: 20px;
border-radius: 10px;
margin-bottom: 20px;
}
.status-box {
background: #f8f9fa;
border-left: 4px solid #28a745;
padding: 15px;
border-radius: 5px;
}
"""

# Banner rendered above the tabs; styled by ``.main-header`` in the CSS above.
_HEADER_HTML = """
<div class="main-header">
<h1>ποΈ Legal Dashboard</h1>
<p>Advanced Legal Document Management System with AI-Powered OCR</p>
</div>
"""

# Static content of the "About" tab.
_ABOUT_MARKDOWN = """
## Legal Dashboard
**Advanced Legal Document Management System**
### Features:
- π **PDF Processing**: Extract text from PDF documents with high accuracy
- πΌοΈ **Image OCR**: Process scanned documents and images
- π§ **AI-Powered**: Uses advanced transformer models for text recognition
- π **Smart Search**: Intelligent search capabilities across documents
- π **Analytics**: Document analysis and metadata extraction
- π **Secure**: Privacy-focused document processing
### Supported Formats:
- **Documents**: PDF
- **Images**: JPG, JPEG, PNG, BMP, TIFF
### Technology Stack:
- **OCR**: PyMuPDF, OpenCV, Transformers (TrOCR)
- **NLP**: spaCy for named entity recognition
- **ML**: PyTorch, Hugging Face Transformers
- **Interface**: Gradio for web interface
### Usage:
1. Upload your document using the **Document Processing** tab
2. Click **Process Document** to extract text
3. Use the **Search** tab to find specific content
4. Check **System Status** for service health
---
*This system is designed for legal professionals to efficiently process and manage legal documents with the power of AI.*
"""


def _build_processing_tab(dashboard):
    """Add the upload/OCR tab and wire its button to ``process_document``."""
    with gr.Tab("π Document Processing"):
        gr.Markdown("## Upload and Process Documents")
        gr.Markdown("Upload PDF files or images to extract text using advanced OCR technology.")
        with gr.Row():
            with gr.Column(scale=1):
                file_input = gr.File(
                    label="Upload Document",
                    file_types=[".pdf", ".jpg", ".jpeg", ".png", ".bmp", ".tiff"],
                    type="file"
                )
                process_btn = gr.Button("π Process Document", variant="primary", size="lg")
            with gr.Column(scale=2):
                status_output = gr.Textbox(
                    label="Processing Status",
                    placeholder="Upload a document and click 'Process Document' to begin...",
                    interactive=False
                )
        with gr.Row():
            with gr.Column():
                extracted_text = gr.Textbox(
                    label="Extracted Text",
                    placeholder="Processed text will appear here...",
                    lines=15,
                    max_lines=30,
                    interactive=False
                )
            with gr.Column():
                metadata_output = gr.Textbox(
                    label="Processing Details",
                    placeholder="Processing metadata and analysis will appear here...",
                    lines=15,
                    max_lines=30,
                    interactive=False
                )
        # Connect the processing function; the three outputs line up with the
        # (status, text, metadata) tuple it returns.
        process_btn.click(
            fn=dashboard.process_document,
            inputs=[file_input],
            outputs=[status_output, extracted_text, metadata_output]
        )


def _build_search_tab(dashboard):
    """Add the search tab and wire its button to ``search_documents``."""
    with gr.Tab("π Search Documents"):
        gr.Markdown("## Search Processed Documents")
        gr.Markdown("Search through previously processed documents using keywords and phrases.")
        with gr.Row():
            search_input = gr.Textbox(
                label="Search Query",
                placeholder="Enter keywords to search...",
                scale=3
            )
            search_btn = gr.Button("π Search", variant="primary", scale=1)
        search_results = gr.Textbox(
            label="Search Results",
            placeholder="Search results will appear here...",
            lines=10,
            interactive=False
        )
        # Connect search function
        search_btn.click(
            fn=dashboard.search_documents,
            inputs=[search_input],
            outputs=[search_results]
        )


def _build_status_tab(dashboard, iface):
    """Add the status tab; the report refreshes on click and on page load."""
    with gr.Tab("βοΈ System Status"):
        gr.Markdown("## System Status and Information")
        status_btn = gr.Button("π Refresh Status", variant="secondary")
        system_status = gr.Textbox(
            label="System Status",
            placeholder="Click 'Refresh Status' to check system health...",
            lines=15,
            interactive=False
        )
        # Connect status function
        status_btn.click(
            fn=dashboard.get_system_status,
            outputs=[system_status]
        )
        # Auto-load status on interface start
        iface.load(
            fn=dashboard.get_system_status,
            outputs=[system_status]
        )


def _build_about_tab():
    """Add the static "About" tab."""
    with gr.Tab("π About"):
        gr.Markdown(_ABOUT_MARKDOWN)


def create_gradio_interface():
    """
    Create and return the Gradio interface

    Returns ``None`` when Gradio is not installed; otherwise the assembled
    ``gr.Blocks`` app (not yet launched) with the header followed by the
    Document Processing, Search, System Status and About tabs.
    """
    if not GRADIO_AVAILABLE:
        return None
    dashboard = LegalDashboardGradio()
    with gr.Blocks(css=_DASHBOARD_CSS, title="Legal Dashboard", theme=gr.themes.Soft()) as iface:
        # Header
        gr.HTML(_HEADER_HTML)
        # Tabs are created in display order; each builder must run inside
        # this Blocks context so its components attach to ``iface``.
        _build_processing_tab(dashboard)
        _build_search_tab(dashboard)
        _build_status_tab(dashboard, iface)
        _build_about_tab()
    return iface
def launch_gradio_app():
    """
    Launch the Gradio application

    Returns the interface object that was launched, or ``None`` when Gradio
    is not installed or the interface could not be created.
    """
    # Local import: only this function needs signature introspection.
    import inspect

    if not GRADIO_AVAILABLE:
        print("β Gradio not available. Please install gradio: pip install gradio")
        return None
    iface = create_gradio_interface()
    if iface:
        print("π Starting Legal Dashboard Gradio Interface...")
        launch_options = {
            "server_name": "0.0.0.0",
            "server_port": 7860,
            "share": False,
            "show_error": True,
            "show_tips": True,
            "enable_queue": True,
        }
        # Some of these keywords (notably show_tips / enable_queue) are not
        # accepted by every Gradio release; passing an unknown keyword raises
        # TypeError. Only forward what this launch() actually declares.
        try:
            accepted = inspect.signature(iface.launch).parameters
        except (TypeError, ValueError):
            accepted = None  # signature unavailable: pass everything through
        if accepted is not None and not any(
            param.kind is inspect.Parameter.VAR_KEYWORD for param in accepted.values()
        ):
            unsupported = sorted(set(launch_options) - set(accepted))
            if unsupported:
                logging.getLogger(__name__).info(
                    "Ignoring launch options unsupported by this Gradio version: %s",
                    ", ".join(unsupported),
                )
            launch_options = {
                key: value for key, value in launch_options.items() if key in accepted
            }
        iface.launch(**launch_options)
    return iface
# Script entry point: start the Gradio server only when this file is run
# directly, not when it is imported as a module.
if __name__ == "__main__":
    launch_gradio_app()