Spaces:
Paused
Paused
"""
Hugging Face Space Entry Point
==============================
Gradio interface for the Legal Dashboard OCR system.
"""
import logging
import os
import sys
import tempfile
from pathlib import Path

import gradio as gr

# Add the directory two levels above this file to the import path.
# This has to run BEFORE the ``app.services`` imports below: appending to
# ``sys.path`` after those imports (as the code previously did) cannot help
# them resolve.
sys.path.append(str(Path(__file__).parent.parent))

from app.services.ai_service import AIScoringEngine  # noqa: E402
from app.services.database_service import DatabaseManager  # noqa: E402
from app.services.ocr_service import OCRPipeline  # noqa: E402

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize services once at import time; the handler functions in this
# module use these shared module-level instances.
ocr_pipeline = OCRPipeline()
db_manager = DatabaseManager()
ai_engine = AIScoringEngine()
def _upload_path(file):
    """Return the filesystem path of an uploaded file.

    Accepts either a plain path (``str`` / ``os.PathLike``) or an object
    that exposes its path as ``.name`` (the only form previously handled).
    """
    if isinstance(file, (str, os.PathLike)):
        return os.fspath(file)
    return file.name


def _analyze_pdf(file):
    """Run OCR and AI analysis on an uploaded PDF and return the raw values.

    Args:
        file: The upload, in any form accepted by ``_upload_path``.

    Returns:
        A ``(error, analysis)`` pair. When the OCR result does not report
        success, ``error`` is the user-facing status message and
        ``analysis`` is ``None``. Otherwise ``error`` is ``None`` and
        ``analysis`` is a dict with the keys ``file_path``,
        ``extracted_text``, ``confidence``, ``processing_time``,
        ``page_count``, ``final_score``, ``category`` and ``keywords``.

    Raises:
        Any exception raised by the OCR or AI services; the public entry
        points catch and report these.
    """
    file_path = _upload_path(file)

    # Process with OCR
    result = ocr_pipeline.extract_text_from_pdf(file_path)
    if not result.get('success', False):
        error_msg = result.get('error_message', 'Unknown error')
        return f"❌ OCR processing failed: {error_msg}", None

    extracted_text = result.get('extracted_text', '')
    confidence = result.get('confidence', 0.0)
    title = os.path.basename(file_path)

    # Payload handed to the scoring engine
    document_data = {
        'title': title,
        'full_text': extracted_text,
        'source': 'Uploaded via HF Space',
        'ocr_confidence': confidence,
    }

    # Dict values are evaluated top to bottom, so the AI calls run in the
    # order: score, category, keywords.
    return None, {
        'file_path': file_path,
        'extracted_text': extracted_text,
        'confidence': confidence,
        'processing_time': result.get('processing_time', 0.0),
        'page_count': result.get('page_count', 0),
        'final_score': ai_engine.calculate_score(document_data),
        'category': ai_engine.predict_category(title, extracted_text),
        'keywords': ai_engine.extract_keywords(extracted_text),
    }


def process_pdf(file):
    """Process an uploaded PDF and format the results for display.

    Args:
        file: The uploaded file, or ``None`` when nothing was uploaded.

    Returns:
        A 4-tuple ``(status, extracted_text, score_info, ocr_info)`` of
        strings. On any failure the status starts with "❌" and the other
        three items are empty strings; this function never raises.
    """
    try:
        if file is None:
            return "❌ Please upload a PDF file", "", "", ""

        error, analysis = _analyze_pdf(file)
        if error is not None:
            return error, "", "", ""

        # Prepare results (only the first five keywords are shown)
        score_info = (
            f"AI Score: {analysis['final_score']:.2f}/100\n"
            f"Category: {analysis['category']}\n"
            f"Keywords: {', '.join(analysis['keywords'][:5])}"
        )
        ocr_info = (
            f"Confidence: {analysis['confidence']:.2f}\n"
            f"Processing Time: {analysis['processing_time']:.2f}s\n"
            f"Pages: {analysis['page_count']}"
        )
        return (
            "✅ PDF processed successfully!",
            analysis['extracted_text'],
            score_info,
            ocr_info,
        )
    except Exception as e:
        # UI boundary: report the problem to the user instead of raising,
        # but keep the traceback in the log.
        logger.exception("Error processing PDF: %s", e)
        return f"❌ Error: {e}", "", "", ""


def save_document(file, title, source, category):
    """Process an uploaded PDF and store the result in the database.

    The stored confidence, processing time and score are the raw values
    from the analysis, not numbers parsed back out of the display strings
    produced by ``process_pdf``.

    Args:
        file: The uploaded file, or ``None`` when nothing was uploaded.
        title: Document title; falls back to the upload's base file name.
        source: Document source; falls back to 'HF Space Upload'.
        category: Document category; falls back to 'عمومی'.

    Returns:
        A single status string. It starts with "✅" and includes the new
        document ID on success, and starts with "❌" on any failure; this
        function never raises.
    """
    try:
        if file is None:
            return "❌ Please upload a PDF file"

        error, analysis = _analyze_pdf(file)
        if error is not None:
            return error

        # Prepare document data
        document_data = {
            'title': title or os.path.basename(analysis['file_path']),
            'source': source or 'HF Space Upload',
            'category': category or 'عمومی',
            'full_text': analysis['extracted_text'],
            'ocr_confidence': float(analysis['confidence']),
            'processing_time': float(analysis['processing_time']),
            'final_score': float(analysis['final_score']),
        }

        # Save to database
        document_id = db_manager.insert_document(document_data)
        return f"✅ Document saved successfully! ID: {document_id}"
    except Exception as e:
        logger.exception("Error saving document: %s", e)
        return f"❌ Error saving document: {e}"
def get_dashboard_stats():
    """Build the plain-text statistics shown on the Dashboard tab.

    Reads the summary from ``db_manager.get_dashboard_summary()`` and
    renders the totals followed by at most five category lines.

    Returns:
        The formatted statistics text, or a message starting with "❌" if
        anything goes wrong; this function never raises.
    """
    try:
        summary = db_manager.get_dashboard_summary()

        # Explicit lines keep the rendered text independent of how this
        # source file happens to be indented.
        header = (
            "\n"
            "📊 Dashboard Statistics\n"
            f"📄 Total Documents: {summary['total_documents']}\n"
            f"📅 Processed Today: {summary['processed_today']}\n"
            f"⭐ Average Score: {summary['average_score']}\n"
            "🏷️ Top Categories:\n"
        )
        category_lines = "".join(
            f"• {cat['category']}: {cat['count']} documents\n"
            for cat in summary['top_categories'][:5]
        )
        return header + category_lines
    except Exception as e:
        # UI boundary: show the error to the user, keep the traceback in
        # the log.
        logger.exception("Error getting dashboard stats: %s", e)
        return f"❌ Error loading statistics: {e}"
# Create Gradio interface
# Three tabs (PDF Processing, Dashboard, About) plus the button wiring.
# The resulting Blocks object is bound to ``demo``.
with gr.Blocks(title="Legal Dashboard OCR", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🏛️ Legal Dashboard OCR System")
    gr.Markdown(
        "AI-powered Persian legal document processing with OCR capabilities")
    with gr.Tabs():
        # PDF Processing Tab
        with gr.Tab("📄 PDF Processing"):
            # First row: upload and action buttons on one side, optional
            # metadata used by "Process & Save" on the other.
            with gr.Row():
                with gr.Column():
                    file_input = gr.File(
                        label="Upload PDF Document", file_types=[".pdf"])
                    process_btn = gr.Button("🔍 Process PDF", variant="primary")
                    save_btn = gr.Button(
                        "💾 Process & Save", variant="secondary")
                with gr.Column():
                    title_input = gr.Textbox(label="Document Title (optional)")
                    source_input = gr.Textbox(label="Source (optional)")
                    # Pre-selects "عمومی", the same value save_document
                    # falls back to when no category is given.
                    category_input = gr.Dropdown(
                        choices=["عمومی", "قانون", "قضایی",
                                 "کیفری", "مدنی", "اداری", "تجاری"],
                        label="Category (optional)",
                        value="عمومی"
                    )
            # Second row: read-only result fields filled by the handlers.
            with gr.Row():
                with gr.Column():
                    status_output = gr.Textbox(
                        label="Status", interactive=False)
                    extracted_text = gr.Textbox(
                        label="Extracted Text",
                        lines=10,
                        max_lines=20,
                        interactive=False
                    )
                with gr.Column():
                    score_info = gr.Textbox(
                        label="AI Analysis", lines=5, interactive=False)
                    ocr_info = gr.Textbox(
                        label="OCR Information", lines=5, interactive=False)
        # Dashboard Tab
        with gr.Tab("📊 Dashboard"):
            refresh_btn = gr.Button("🔄 Refresh Statistics", variant="primary")
            stats_output = gr.Textbox(
                label="Dashboard Statistics", lines=15, interactive=False)
        # About Tab
        with gr.Tab("ℹ️ About"):
            gr.Markdown("""
            ## Legal Dashboard OCR System
            This system provides advanced OCR capabilities for Persian legal documents using Hugging Face models.
            ### Features:
            - 📄 PDF text extraction with OCR
            - 🤖 AI-powered document scoring
            - 🏷️ Automatic category prediction
            - 📊 Dashboard analytics
            - 💾 Document storage and management
            ### OCR Models:
            - Microsoft TrOCR for printed text
            - Support for Persian/Farsi documents
            - Intelligent content detection
            ### AI Scoring:
            - Keyword relevance analysis
            - Document completeness assessment
            - Source credibility evaluation
            - Quality metrics calculation
            ### Usage:
            1. Upload a PDF document
            2. Click "Process PDF" to extract text
            3. Review AI analysis and OCR information
            4. Optionally save to database
            5. View dashboard statistics
            """)
    # Event handlers
    # process_pdf returns four values, mapped in order onto the four
    # output components listed here.
    process_btn.click(
        fn=process_pdf,
        inputs=[file_input],
        outputs=[status_output, extracted_text, score_info, ocr_info]
    )
    # save_document returns a single status string, so only the status box
    # is updated; the text and analysis boxes are left as they were.
    save_btn.click(
        fn=save_document,
        inputs=[file_input, title_input, source_input, category_input],
        outputs=[status_output]
    )
    # get_dashboard_stats takes no inputs and returns the statistics text.
    refresh_btn.click(
        fn=get_dashboard_stats,
        inputs=[],
        outputs=[stats_output]
    )
# Launch the app
if __name__ == "__main__":
    # Server settings gathered in one place: host 0.0.0.0, port 7860,
    # sharing disabled.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
    }
    demo.launch(**launch_options)