Really-amin's picture
Update huggingface_space/app.py
25da451 verified
raw
history blame
12.4 kB
import os
import tempfile
import logging
from pathlib import Path
from typing import Optional, Tuple
try:
import gradio as gr
GRADIO_AVAILABLE = True
except ImportError:
GRADIO_AVAILABLE = False
logging.warning("Gradio not available")
# Import our services
try:
from app.services.ocr_service import ocr_service
from app.services.database_service import DatabaseService
OCR_AVAILABLE = True
except ImportError:
OCR_AVAILABLE = False
logging.warning("OCR service not available")
logger = logging.getLogger(__name__)
class LegalDashboardGradio:
    """Backend callbacks for the Legal Dashboard Gradio interface.

    Holds the optional OCR and database services and exposes the three
    handlers wired to the UI: document processing, document search and a
    system status report. Every handler returns plain strings and reports
    failures through its return value instead of raising, so the UI always
    has something to display.
    """

    # Extensions routed to the image OCR path; ".pdf" is handled separately.
    _IMAGE_EXTENSIONS = frozenset({".jpg", ".jpeg", ".png", ".bmp", ".tiff"})

    def __init__(self):
        """Bind the OCR service if it was imported and try to open the database."""
        self.ocr_service = ocr_service if OCR_AVAILABLE else None
        self.db_service = None

        # The database is optional: any failure while constructing it only
        # disables search, it must not prevent the dashboard from starting.
        try:
            self.db_service = DatabaseService()
        except Exception as e:
            logger.warning("Database service not available: %s", e)

    @staticmethod
    def _mark(ok, ready: str = "Available", missing: str = "Not Available") -> str:
        """Return a check/cross marker followed by the matching label."""
        return f"✅ {ready}" if ok else f"❌ {missing}"

    async def process_document(self, file) -> Tuple[str, str, str]:
        """Run OCR on an uploaded document.

        Args:
            file: The upload as delivered by the file component: either a
                filesystem path (``str`` / ``os.PathLike``) or an object
                exposing the path as ``.name``.

        Returns:
            A ``(status, extracted_text, metadata)`` tuple. On any failure
            ``status`` starts with "❌" and the other two items are empty.
        """
        if not file:
            return "❌ No file uploaded", "", ""
        if not self.ocr_service:
            return "❌ OCR service not available", "", ""

        try:
            # Accept both a bare path and a tempfile-like wrapper so the
            # handler works regardless of how the upload component is typed.
            raw_path = file if isinstance(file, (str, os.PathLike)) else file.name
            file_path = os.fspath(raw_path)
            file_extension = Path(file_path).suffix.lower()

            # Dispatch on file type.
            if file_extension == ".pdf":
                result = await self.ocr_service.extract_text_from_pdf(file_path)
            elif file_extension in self._IMAGE_EXTENSIONS:
                result = await self.ocr_service.extract_text_from_image(file_path)
            else:
                return f"❌ Unsupported file type: {file_extension}", "", ""

            if not result.get("success"):
                error_msg = result.get("metadata", {}).get("error", "Unknown error")
                return f"❌ Processing failed: {error_msg}", "", ""

            text = result.get("text") or ""
            method = result.get("method", "unknown")

            # NLP enrichment is best-effort: a failure here must not throw
            # away text that OCR already extracted successfully.
            try:
                processed = await self.ocr_service.process_text(text)
            except Exception:
                logger.exception("NLP post-processing failed; returning OCR text only")
                processed = None
            entities = (processed or {}).get("entities") or []

            status = f"✅ Successfully processed using {method}"
            metadata = "\n".join(
                [
                    "",
                    "**Processing Details:**",
                    f"- Method: {method}",
                    f"- Character Count: {len(text)}",
                    f"- Pages: {len(result.get('pages') or [])}",
                    "",
                ]
            )
            if entities:
                # Only the first 10 entities are listed to keep the panel short.
                metadata += "\n**Named Entities Found:**\n" + "".join(
                    f"- {ent['text']} ({ent['label']})\n" for ent in entities[:10]
                )

            return status, text, metadata
        except Exception as e:
            logger.exception("Document processing error")
            return f"❌ Error: {e}", "", ""

    def search_documents(self, query: str) -> str:
        """Search previously processed documents.

        Database-backed search is not implemented yet; a placeholder message
        is returned for any non-empty query.

        Args:
            query: Free-text search string; ``None``, empty and
                whitespace-only values are rejected.

        Returns:
            A human-readable message for the results box.
        """
        if not query or not query.strip():
            return "Please enter a search query"
        if not self.db_service:
            return "Database service not available"

        # Placeholder until the database lookup is implemented.
        return (
            f"Search results for '{query}' would appear here.\n\n"
            "Database integration coming soon..."
        )

    def get_system_status(self) -> str:
        """Build a multi-line, Markdown-flavoured report of service availability.

        Returns:
            The report text, or an "Error getting system status" message if
            querying the OCR service fails.
        """
        try:
            report = []

            # OCR service section.
            if self.ocr_service:
                ocr_status = self.ocr_service.get_service_status()
                # Missing keys are reported as "not ready" rather than
                # aborting the whole report.
                models = ocr_status.get("models_loaded") or []
                report.append("🔍 **OCR Service:**")
                report.append(
                    " - Status: "
                    + self._mark(ocr_status.get("fallback_ready"), "Ready", "Not Ready")
                )
                report.append(
                    " - Transformers: " + self._mark(ocr_status.get("transformers_ready"))
                )
                report.append(" - spaCy: " + self._mark(ocr_status.get("spacy_ready")))
                report.append(f" - Models: {', '.join(models) if models else 'None'}")
            else:
                report.append("🔍 **OCR Service:** ❌ Not Available")

            # Database service section.
            report.append("\n💾 **Database Service:** " + self._mark(self.db_service))

            # General system section.
            report.append("\n🖥️ **System Info:**")
            report.append(" - Python: Available")
            report.append(" - Gradio: " + self._mark(GRADIO_AVAILABLE))

            return "\n".join(report)
        except Exception as e:
            return f"Error getting system status: {e}"
# Stylesheet injected into the Blocks app.
_DASHBOARD_CSS = """
.gradio-container {
    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
}
.main-header {
    text-align: center;
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    padding: 20px;
    border-radius: 10px;
    margin-bottom: 20px;
}
.status-box {
    background: #f8f9fa;
    border-left: 4px solid #28a745;
    padding: 15px;
    border-radius: 5px;
}
"""

# Banner rendered above the tabs.
_HEADER_HTML = """
<div class="main-header">
    <h1>🏛️ Legal Dashboard</h1>
    <p>Advanced Legal Document Management System with AI-Powered OCR</p>
</div>
"""

# Static text of the "About" tab. Kept flush-left at module level so the
# Markdown is never interpreted as an indented code block.
_ABOUT_MARKDOWN = """
## Legal Dashboard

**Advanced Legal Document Management System**

### Features:
- 📄 **PDF Processing**: Extract text from PDF documents with high accuracy
- 🖼️ **Image OCR**: Process scanned documents and images
- 🧠 **AI-Powered**: Uses advanced transformer models for text recognition
- 🔍 **Smart Search**: Intelligent search capabilities across documents
- 📊 **Analytics**: Document analysis and metadata extraction
- 🔒 **Secure**: Privacy-focused document processing

### Supported Formats:
- **Documents**: PDF
- **Images**: JPG, JPEG, PNG, BMP, TIFF

### Technology Stack:
- **OCR**: PyMuPDF, OpenCV, Transformers (TrOCR)
- **NLP**: spaCy for named entity recognition
- **ML**: PyTorch, Hugging Face Transformers
- **Interface**: Gradio for web interface

### Usage:
1. Upload your document using the **Document Processing** tab
2. Click **Process Document** to extract text
3. Use the **Search** tab to find specific content
4. Check **System Status** for service health

---

*This system is designed for legal professionals to efficiently process and manage legal documents with the power of AI.*
"""

# Extensions offered by the upload widget.
_UPLOAD_FILE_TYPES = [".pdf", ".jpg", ".jpeg", ".png", ".bmp", ".tiff"]


def _build_processing_tab(dashboard):
    """Lay out the upload/OCR tab and wire its button to ``dashboard.process_document``."""
    with gr.Tab("📄 Document Processing"):
        gr.Markdown("## Upload and Process Documents")
        gr.Markdown("Upload PDF files or images to extract text using advanced OCR technology.")

        with gr.Row():
            with gr.Column(scale=1):
                # No explicit `type`: "file" is rejected by Gradio 4+, while
                # "filepath" is unknown to Gradio 3. The library default is
                # valid on both, so the handler may receive either a path
                # string or a tempfile-like object depending on the version.
                file_input = gr.File(
                    label="Upload Document",
                    file_types=_UPLOAD_FILE_TYPES,
                )
                process_btn = gr.Button("🔍 Process Document", variant="primary", size="lg")
            with gr.Column(scale=2):
                status_output = gr.Textbox(
                    label="Processing Status",
                    placeholder="Upload a document and click 'Process Document' to begin...",
                    interactive=False,
                )

        with gr.Row():
            with gr.Column():
                extracted_text = gr.Textbox(
                    label="Extracted Text",
                    placeholder="Processed text will appear here...",
                    lines=15,
                    max_lines=30,
                    interactive=False,
                )
            with gr.Column():
                metadata_output = gr.Textbox(
                    label="Processing Details",
                    placeholder="Processing metadata and analysis will appear here...",
                    lines=15,
                    max_lines=30,
                    interactive=False,
                )

        # The handler returns (status, text, metadata) in this order.
        process_btn.click(
            fn=dashboard.process_document,
            inputs=[file_input],
            outputs=[status_output, extracted_text, metadata_output],
        )


def _build_search_tab(dashboard):
    """Lay out the search tab and wire its button to ``dashboard.search_documents``."""
    with gr.Tab("🔍 Search Documents"):
        gr.Markdown("## Search Processed Documents")
        gr.Markdown("Search through previously processed documents using keywords and phrases.")

        with gr.Row():
            search_input = gr.Textbox(
                label="Search Query",
                placeholder="Enter keywords to search...",
                scale=3,
            )
            search_btn = gr.Button("🔍 Search", variant="primary", scale=1)

        search_results = gr.Textbox(
            label="Search Results",
            placeholder="Search results will appear here...",
            lines=10,
            interactive=False,
        )

        search_btn.click(
            fn=dashboard.search_documents,
            inputs=[search_input],
            outputs=[search_results],
        )


def _build_status_tab(dashboard, iface):
    """Lay out the status tab; the report refreshes on click and on page load."""
    with gr.Tab("⚙️ System Status"):
        gr.Markdown("## System Status and Information")

        status_btn = gr.Button("🔄 Refresh Status", variant="secondary")
        system_status = gr.Textbox(
            label="System Status",
            placeholder="Click 'Refresh Status' to check system health...",
            lines=15,
            interactive=False,
        )

        status_btn.click(
            fn=dashboard.get_system_status,
            outputs=[system_status],
        )

        # Populate the box as soon as the page opens so the user does not
        # have to click the button first.
        iface.load(
            fn=dashboard.get_system_status,
            outputs=[system_status],
        )


def _build_about_tab():
    """Lay out the static "About" tab."""
    with gr.Tab("📚 About"):
        gr.Markdown(_ABOUT_MARKDOWN)


def create_gradio_interface():
    """Build the Legal Dashboard Gradio app.

    Returns:
        The assembled ``gr.Blocks`` object (not yet launched), or ``None``
        when Gradio could not be imported.
    """
    if not GRADIO_AVAILABLE:
        return None

    dashboard = LegalDashboardGradio()

    with gr.Blocks(css=_DASHBOARD_CSS, title="Legal Dashboard", theme=gr.themes.Soft()) as iface:
        gr.HTML(_HEADER_HTML)

        # Components created inside these helpers attach to the Blocks
        # context that is active here.
        _build_processing_tab(dashboard)
        _build_search_tab(dashboard)
        _build_status_tab(dashboard, iface)
        _build_about_tab()

    return iface
def launch_gradio_app():
    """Build the dashboard and serve it on all interfaces, port 7860.

    Returns:
        The launched ``gr.Blocks`` object, or ``None`` when Gradio is not
        installed or the interface could not be created.
    """
    if not GRADIO_AVAILABLE:
        print("❌ Gradio not available. Please install gradio: pip install gradio")
        return None

    iface = create_gradio_interface()
    if iface is None:
        return None

    print("🚀 Starting Legal Dashboard Gradio Interface...")

    # `launch(enable_queue=..., show_tips=...)` no longer exists in Gradio 4
    # and raises TypeError there; `queue()` is the supported way to turn on
    # request queuing and is available in both 3.x and 4.x.
    iface.queue()
    iface.launch(
        server_name="0.0.0.0",  # listen on all interfaces so the container port is reachable
        server_port=7860,
        share=False,
        show_error=True,
    )
    return iface
# Script entry point: start the dashboard only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    launch_gradio_app()