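# NOTE: the lines below are web-page residue from the Hugging Face file viewer,
# not part of the program. They are kept here as a comment for provenance.
#
# Space:   Really-amin / Hoghoghi (status: Paused)
# File:    Hoghoghi / huggingface_space / app.py
# Commit:  "Upload 46 files" by Really-amin, 922c3ba (verified), 4 months ago
# Size:    8.38 kB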

	"""
	Hugging Face Space Entry Point
	==============================

	Gradio interface for the Legal Dashboard OCR system.
	"""

	import logging
	import os
	import sys
	import tempfile
	from pathlib import Path

	import gradio as gr

	# The ``app`` package lives one directory above this script's folder, so that
	# directory must be on ``sys.path`` *before* the ``app.services`` imports
	# below run (previously the path was extended only after the imports, which
	# made the tweak ineffective).
	# NOTE(review): the root is inserted first rather than appended so that a
	# sibling script named ``app.py`` cannot shadow the ``app`` package -- confirm
	# this matches the deployed layout.
	_PROJECT_ROOT = str(Path(__file__).resolve().parent.parent)
	if _PROJECT_ROOT not in sys.path:
	    sys.path.insert(0, _PROJECT_ROOT)

	from app.services.ai_service import AIScoringEngine  # noqa: E402
	from app.services.database_service import DatabaseManager  # noqa: E402
	from app.services.ocr_service import OCRPipeline  # noqa: E402

	# Configure logging
	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger(__name__)

	# Initialize services once at import time; the Gradio callbacks below share
	# these module-level instances.
	ocr_pipeline = OCRPipeline()
	db_manager = DatabaseManager()
	ai_engine = AIScoringEngine()


	def _upload_path(file):
	    """Return the filesystem path of an uploaded file.

	    Accepts either a plain path (``str`` / ``os.PathLike``) or an object
	    exposing the path as ``.name`` (tempfile-style wrapper), so the callbacks
	    work with both ways a Gradio ``File`` component can deliver its value.
	    """
	    if isinstance(file, (str, os.PathLike)):
	        return os.fspath(file)
	    return file.name


	def _analyze_upload(file):
	    """Run OCR and AI scoring on an uploaded PDF.

	    Shared by ``process_pdf`` and ``save_document`` so that the raw numeric
	    results are available directly instead of being parsed back out of the
	    human-readable summary strings.

	    Args:
	        file: Value delivered by the ``gr.File`` input, or ``None``.

	    Returns:
	        A ``(error_status, analysis)`` pair. On failure ``error_status`` is
	        the user-facing message and ``analysis`` is ``None``. On success
	        ``error_status`` is ``None`` and ``analysis`` is a dict with the keys
	        ``title``, ``extracted_text``, ``confidence``, ``processing_time``,
	        ``page_count``, ``final_score``, ``category``, ``keywords``,
	        ``score_info`` and ``ocr_info``.
	    """
	    if file is None:
	        return "❌ Please upload a PDF file", None

	    try:
	        file_path = _upload_path(file)

	        # Process with OCR
	        result = ocr_pipeline.extract_text_from_pdf(file_path)

	        if not result.get('success', False):
	            error_msg = result.get('error_message', 'Unknown error')
	            return f"❌ OCR processing failed: {error_msg}", None

	        extracted_text = result.get('extracted_text', '')
	        confidence = result.get('confidence', 0.0)
	        processing_time = result.get('processing_time', 0.0)
	        page_count = result.get('page_count', 0)

	        # Calculate AI score
	        title = os.path.basename(file_path)
	        final_score = ai_engine.calculate_score({
	            'title': title,
	            'full_text': extracted_text,
	            'source': 'Uploaded via HF Space',
	            'ocr_confidence': confidence,
	        })
	        category = ai_engine.predict_category(title, extracted_text)
	        keywords = ai_engine.extract_keywords(extracted_text)

	        # Build the display strings inside the ``try`` so that a value that
	        # cannot be formatted is reported like any other processing error.
	        score_info = f"AI Score: {final_score:.2f}/100\nCategory: {category}\nKeywords: {', '.join(keywords[:5])}"
	        ocr_info = f"Confidence: {confidence:.2f}\nProcessing Time: {processing_time:.2f}s\nPages: {page_count}"
	    except Exception as e:
	        # UI boundary: log with traceback, show a short message to the user.
	        logger.exception("Error processing PDF: %s", e)
	        return f"❌ Error: {str(e)}", None

	    return None, {
	        'title': title,
	        'extracted_text': extracted_text,
	        'confidence': confidence,
	        'processing_time': processing_time,
	        'page_count': page_count,
	        'final_score': final_score,
	        'category': category,
	        'keywords': keywords,
	        'score_info': score_info,
	        'ocr_info': ocr_info,
	    }


	def process_pdf(file):
	    """Process an uploaded PDF and return the values for the result widgets.

	    Args:
	        file: Value delivered by the ``gr.File`` input, or ``None``.

	    Returns:
	        A 4-tuple ``(status, extracted_text, score_info, ocr_info)`` of
	        strings. On any failure the status starts with "❌" and the other
	        three items are empty strings.
	    """
	    error_status, analysis = _analyze_upload(file)
	    if error_status is not None:
	        return error_status, "", "", ""

	    return (
	        "✅ PDF processed successfully!",
	        analysis['extracted_text'],
	        analysis['score_info'],
	        analysis['ocr_info'],
	    )


	def save_document(file, title, source, category):
	    """Process an uploaded PDF and store the result in the database.

	    Args:
	        file: Value delivered by the ``gr.File`` input, or ``None``.
	        title: Optional document title; falls back to the file's base name.
	        source: Optional source label; falls back to ``'HF Space Upload'``.
	        category: Optional category; falls back to ``'عمومی'``.

	    Returns:
	        A status string: the new document ID on success, otherwise a message
	        starting with "❌".
	    """
	    try:
	        error_status, analysis = _analyze_upload(file)
	        if error_status is not None:
	            return error_status

	        # Prepare document data from the raw analysis values (no rounding
	        # through the display strings).
	        document_data = {
	            'title': title or analysis['title'],
	            'source': source or 'HF Space Upload',
	            'category': category or 'عمومی',
	            'full_text': analysis['extracted_text'],
	            'ocr_confidence': float(analysis['confidence']),
	            'processing_time': float(analysis['processing_time']),
	            'final_score': float(analysis['final_score']),
	        }

	        # Save to database
	        document_id = db_manager.insert_document(document_data)

	        return f"✅ Document saved successfully! ID: {document_id}"

	    except Exception as e:
	        logger.exception("Error saving document: %s", e)
	        return f"❌ Error saving document: {str(e)}"


	def get_dashboard_stats():
	    """Build the text shown in the "Dashboard Statistics" box.

	    Reads the summary from ``db_manager.get_dashboard_summary()`` and renders
	    the totals followed by at most five entries of ``top_categories``.

	    Returns:
	        The formatted statistics text, or a message starting with "❌" if the
	        summary could not be loaded or is missing an expected key.
	    """
	    try:
	        summary = db_manager.get_dashboard_summary()

	        # Assemble the report line by line; the leading empty entry keeps the
	        # blank first line of the original layout, and building from a list
	        # keeps the text independent of this function's source indentation.
	        lines = [
	            "",
	            "📊 Dashboard Statistics",
	            "",
	            f"📄 Total Documents: {summary['total_documents']}",
	            f"📅 Processed Today: {summary['processed_today']}",
	            f"⭐ Average Score: {summary['average_score']}",
	            "",
	            "🏷️ Top Categories:",
	        ]
	        lines.extend(
	            f"• {cat['category']}: {cat['count']} documents"
	            for cat in summary['top_categories'][:5]
	        )

	        return "\n".join(lines) + "\n"

	    except Exception as e:
	        # UI boundary: log with traceback, show a short message to the user.
	        logger.exception("Error getting dashboard stats: %s", e)
	        return f"❌ Error loading statistics: {str(e)}"


	# Create Gradio interface.
	# Every component and event listener below is created inside the
	# ``gr.Blocks`` context so that it is registered on ``demo``.
	with gr.Blocks(title="Legal Dashboard OCR", theme=gr.themes.Soft()) as demo:
	    gr.Markdown("# 🏛️ Legal Dashboard OCR System")
	    gr.Markdown(
	        "AI-powered Persian legal document processing with OCR capabilities")

	    with gr.Tabs():
	        # PDF Processing Tab
	        with gr.Tab("📄 PDF Processing"):
	            # Top row: upload + actions on the left, optional metadata on
	            # the right.
	            with gr.Row():
	                with gr.Column():
	                    file_input = gr.File(
	                        label="Upload PDF Document", file_types=[".pdf"])
	                    process_btn = gr.Button("🔍 Process PDF", variant="primary")
	                    save_btn = gr.Button(
	                        "💾 Process & Save", variant="secondary")

	                with gr.Column():
	                    title_input = gr.Textbox(label="Document Title (optional)")
	                    source_input = gr.Textbox(label="Source (optional)")
	                    category_input = gr.Dropdown(
	                        choices=["عمومی", "قانون", "قضایی",
	                                 "کیفری", "مدنی", "اداری", "تجاری"],
	                        label="Category (optional)",
	                        value="عمومی"
	                    )

	            # Bottom row: read-only result widgets.
	            with gr.Row():
	                with gr.Column():
	                    status_output = gr.Textbox(
	                        label="Status", interactive=False)
	                    extracted_text = gr.Textbox(
	                        label="Extracted Text",
	                        lines=10,
	                        max_lines=20,
	                        interactive=False
	                    )

	                with gr.Column():
	                    score_info = gr.Textbox(
	                        label="AI Analysis", lines=5, interactive=False)
	                    ocr_info = gr.Textbox(
	                        label="OCR Information", lines=5, interactive=False)

	        # Dashboard Tab
	        with gr.Tab("📊 Dashboard"):
	            refresh_btn = gr.Button("🔄 Refresh Statistics", variant="primary")
	            stats_output = gr.Textbox(
	                label="Dashboard Statistics", lines=15, interactive=False)

	        # About Tab
	        with gr.Tab("ℹ️ About"):
	            gr.Markdown("""
	## Legal Dashboard OCR System

	This system provides advanced OCR capabilities for Persian legal documents using Hugging Face models.

	### Features:
	- 📄 PDF text extraction with OCR
	- 🤖 AI-powered document scoring
	- 🏷️ Automatic category prediction
	- 📊 Dashboard analytics
	- 💾 Document storage and management

	### OCR Models:
	- Microsoft TrOCR for printed text
	- Support for Persian/Farsi documents
	- Intelligent content detection

	### AI Scoring:
	- Keyword relevance analysis
	- Document completeness assessment
	- Source credibility evaluation
	- Quality metrics calculation

	### Usage:
	1. Upload a PDF document
	2. Click "Process PDF" to extract text
	3. Review AI analysis and OCR information
	4. Optionally save to database
	5. View dashboard statistics
	""")

	    # Event handlers
	    # "Process PDF" fills all four result widgets.
	    process_btn.click(
	        fn=process_pdf,
	        inputs=[file_input],
	        outputs=[status_output, extracted_text, score_info, ocr_info]
	    )

	    # "Process & Save" only reports a status line.
	    save_btn.click(
	        fn=save_document,
	        inputs=[file_input, title_input, source_input, category_input],
	        outputs=[status_output]
	    )

	    refresh_btn.click(
	        fn=get_dashboard_stats,
	        inputs=[],
	        outputs=[stats_output]
	    )

	# Launch the app only when this file is executed as a script, not when it is
	# imported.
	if __name__ == "__main__":
	    demo.launch(
	        server_name="0.0.0.0",  # listen on all network interfaces
	        server_port=7860,
	        share=False  # no public share link is requested
	    )