Really-amin's picture
Upload 46 files
922c3ba verified
raw
history blame
8.38 kB
"""
Hugging Face Space Entry Point
==============================
Gradio interface for the Legal Dashboard OCR system.
"""
import logging
import os
import sys
import tempfile
from pathlib import Path

import gradio as gr

# Extend the import path BEFORE importing from the `app` package: the
# original appended this entry only after the `app.services` imports had
# already executed, so it could never help them resolve.
# The entry added is the directory one level above this file's folder
# (presumably where the `app` package lives -- confirm against repo layout).
sys.path.append(str(Path(__file__).parent.parent))

from app.services.ai_service import AIScoringEngine  # noqa: E402
from app.services.database_service import DatabaseManager  # noqa: E402
from app.services.ocr_service import OCRPipeline  # noqa: E402

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize services (module-level singletons shared by all UI callbacks)
ocr_pipeline = OCRPipeline()
db_manager = DatabaseManager()
ai_engine = AIScoringEngine()
def _upload_path(file):
    """Return the filesystem path of an uploaded file.

    Depending on the Gradio version / ``gr.File`` ``type`` setting, the
    callback receives either a plain path string or a tempfile-like object
    that exposes the path as ``.name``; both forms are accepted here.
    """
    if isinstance(file, (str, os.PathLike)):
        return os.fspath(file)
    return file.name


def _analyze_pdf(file):
    """Run OCR and AI scoring on an uploaded PDF.

    Shared by ``process_pdf`` (display) and ``save_document`` (persistence)
    so that the values stored in the database are the raw numbers rather
    than numbers parsed back out of rounded display strings.

    Returns:
        A ``(status, analysis)`` tuple. On success ``analysis`` is a dict
        with the raw values (``file_path``, ``extracted_text``,
        ``confidence``, ``processing_time``, ``final_score``) and the
        formatted display strings (``score_info``, ``ocr_info``). On any
        failure ``analysis`` is ``None`` and ``status`` starts with "❌".
    """
    try:
        file_path = _upload_path(file)

        # Process with OCR
        result = ocr_pipeline.extract_text_from_pdf(file_path)
        if not result.get('success', False):
            error_msg = result.get('error_message', 'Unknown error')
            return f"❌ OCR processing failed: {error_msg}", None

        # Extract text and OCR metadata
        extracted_text = result.get('extracted_text', '')
        confidence = result.get('confidence', 0.0)
        processing_time = result.get('processing_time', 0.0)
        page_count = result.get('page_count', 0)

        # Calculate AI score
        document_data = {
            'title': os.path.basename(file_path),
            'full_text': extracted_text,
            'source': 'Uploaded via HF Space',
            'ocr_confidence': confidence,
        }
        final_score = ai_engine.calculate_score(document_data)
        category = ai_engine.predict_category(
            document_data['title'], extracted_text)
        keywords = ai_engine.extract_keywords(extracted_text)

        # Display strings are built inside the try block so a value that
        # cannot be formatted is reported as a processing error.
        score_info = (
            f"AI Score: {final_score:.2f}/100\n"
            f"Category: {category}\n"
            f"Keywords: {', '.join(keywords[:5])}"
        )
        ocr_info = (
            f"Confidence: {confidence:.2f}\n"
            f"Processing Time: {processing_time:.2f}s\n"
            f"Pages: {page_count}"
        )

        analysis = {
            'file_path': file_path,
            'extracted_text': extracted_text,
            'confidence': confidence,
            'processing_time': processing_time,
            'final_score': final_score,
            'score_info': score_info,
            'ocr_info': ocr_info,
        }
        return "✅ PDF processed successfully!", analysis
    except Exception as e:
        # UI boundary: report the failure to the user instead of crashing
        # the callback, but keep the traceback in the logs.
        logger.exception("Error processing PDF: %s", e)
        return f"❌ Error: {e}", None


def process_pdf(file):
    """Process an uploaded PDF file for display.

    Args:
        file: The upload from ``gr.File`` (path string or object with a
            ``.name`` path), or ``None`` when nothing was uploaded.

    Returns:
        A 4-tuple ``(status, extracted_text, score_info, ocr_info)`` of
        strings. On failure ``status`` starts with "❌" and the other three
        entries are empty strings.
    """
    if file is None:
        return "❌ Please upload a PDF file", "", "", ""

    status, analysis = _analyze_pdf(file)
    if analysis is None:
        return status, "", "", ""

    return (
        status,
        analysis['extracted_text'],
        analysis['score_info'],
        analysis['ocr_info'],
    )


def save_document(file, title, source, category):
    """Process an uploaded PDF and save it to the database.

    Args:
        file: The upload from ``gr.File``, or ``None``.
        title: Optional title; falls back to the uploaded file's base name.
        source: Optional source; falls back to ``'HF Space Upload'``.
        category: Optional category; falls back to ``'عمومی'``.

    Returns:
        A status string: the new document ID on success, otherwise a
        message starting with "❌".
    """
    try:
        if file is None:
            return "❌ Please upload a PDF file"

        # Process PDF; a failed run is reported with its own status message.
        status, analysis = _analyze_pdf(file)
        if analysis is None:
            return status

        # Prepare document data from the raw analysis values (not from the
        # rounded, human-readable strings shown in the UI).
        document_data = {
            'title': title or os.path.basename(analysis['file_path']),
            'source': source or 'HF Space Upload',
            'category': category or 'عمومی',
            'full_text': analysis['extracted_text'],
            'ocr_confidence': float(analysis['confidence']),
            'processing_time': float(analysis['processing_time']),
            'final_score': float(analysis['final_score']),
        }

        # Save to database
        document_id = db_manager.insert_document(document_data)
        return f"✅ Document saved successfully! ID: {document_id}"
    except Exception as e:
        logger.exception("Error saving document: %s", e)
        return f"❌ Error saving document: {e}"
def get_dashboard_stats():
    """Build the text shown in the dashboard statistics box.

    Reads the summary from ``db_manager.get_dashboard_summary()`` and renders
    the totals followed by at most five top categories, one per line.

    Returns:
        The rendered statistics text, or a message starting with "❌" if the
        summary could not be loaded or rendered.
    """
    try:
        summary = db_manager.get_dashboard_summary()

        # Fixed header section: totals first, then the category heading.
        header = (
            "\n"
            "📊 Dashboard Statistics\n"
            f"📄 Total Documents: {summary['total_documents']}\n"
            f"📅 Processed Today: {summary['processed_today']}\n"
            f"⭐ Average Score: {summary['average_score']}\n"
            "🏷️ Top Categories:\n"
        )

        # One bullet line per category, limited to the first five entries.
        category_lines = "".join(
            f"• {entry['category']}: {entry['count']} documents\n"
            for entry in summary['top_categories'][:5]
        )
        return header + category_lines
    except Exception as e:
        logger.error(f"Error getting dashboard stats: {e}")
        reason = str(e)
        return f"❌ Error loading statistics: {reason}"
# Create Gradio interface
# Layout: a page header, then three tabs (PDF processing, dashboard, about).
# Event handlers are registered at the end, still inside the Blocks context.
with gr.Blocks(title="Legal Dashboard OCR", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🏛️ Legal Dashboard OCR System")
    gr.Markdown(
        "AI-powered Persian legal document processing with OCR capabilities"
    )

    with gr.Tabs():
        # PDF Processing Tab
        with gr.Tab("📄 PDF Processing"):
            # Top row: upload + action buttons on the left, optional
            # document metadata on the right.
            with gr.Row():
                with gr.Column():
                    file_input = gr.File(
                        label="Upload PDF Document",
                        file_types=[".pdf"],
                    )
                    process_btn = gr.Button(
                        "🔍 Process PDF",
                        variant="primary",
                    )
                    save_btn = gr.Button(
                        "💾 Process & Save",
                        variant="secondary",
                    )
                with gr.Column():
                    title_input = gr.Textbox(
                        label="Document Title (optional)",
                    )
                    source_input = gr.Textbox(label="Source (optional)")
                    category_input = gr.Dropdown(
                        choices=[
                            "عمومی",
                            "قانون",
                            "قضایی",
                            "کیفری",
                            "مدنی",
                            "اداری",
                            "تجاری",
                        ],
                        label="Category (optional)",
                        value="عمومی",
                    )

            # Bottom row: status + extracted text on the left, analysis
            # summaries on the right. All outputs are read-only.
            with gr.Row():
                with gr.Column():
                    status_output = gr.Textbox(
                        label="Status",
                        interactive=False,
                    )
                    extracted_text = gr.Textbox(
                        label="Extracted Text",
                        lines=10,
                        max_lines=20,
                        interactive=False,
                    )
                with gr.Column():
                    score_info = gr.Textbox(
                        label="AI Analysis",
                        lines=5,
                        interactive=False,
                    )
                    ocr_info = gr.Textbox(
                        label="OCR Information",
                        lines=5,
                        interactive=False,
                    )

        # Dashboard Tab
        with gr.Tab("📊 Dashboard"):
            refresh_btn = gr.Button(
                "🔄 Refresh Statistics",
                variant="primary",
            )
            stats_output = gr.Textbox(
                label="Dashboard Statistics",
                lines=15,
                interactive=False,
            )

        # About Tab
        # The Markdown body is kept flush-left so it is not rendered as an
        # indented code block.
        with gr.Tab("ℹ️ About"):
            gr.Markdown("""
## Legal Dashboard OCR System
This system provides advanced OCR capabilities for Persian legal documents using Hugging Face models.
### Features:
- 📄 PDF text extraction with OCR
- 🤖 AI-powered document scoring
- 🏷️ Automatic category prediction
- 📊 Dashboard analytics
- 💾 Document storage and management
### OCR Models:
- Microsoft TrOCR for printed text
- Support for Persian/Farsi documents
- Intelligent content detection
### AI Scoring:
- Keyword relevance analysis
- Document completeness assessment
- Source credibility evaluation
- Quality metrics calculation
### Usage:
1. Upload a PDF document
2. Click "Process PDF" to extract text
3. Review AI analysis and OCR information
4. Optionally save to database
5. View dashboard statistics
""")

    # Event handlers
    # "Process PDF" fills all four output boxes.
    process_btn.click(
        fn=process_pdf,
        inputs=[file_input],
        outputs=[status_output, extracted_text, score_info, ocr_info],
    )
    # "Process & Save" only reports a status line.
    save_btn.click(
        fn=save_document,
        inputs=[file_input, title_input, source_input, category_input],
        outputs=[status_output],
    )
    # "Refresh Statistics" re-renders the dashboard text.
    refresh_btn.click(
        fn=get_dashboard_stats,
        inputs=[],
        outputs=[stats_output],
    )
# Launch the app
# Only start the server when this file is executed directly, not on import.
if __name__ == "__main__":
    # Serve on host 0.0.0.0, port 7860, with no public share link.
    demo.launch(share=False, server_port=7860, server_name="0.0.0.0")