""" | |
ماژول یکپارچه سیستم OCR پارسی | |
این کد شامل تنظیمات، ابزارهای کمکی، پردازش اسناد و تصاویر، و رابط کاربری Gradio با بهبودهای جدید است. | |
طراحی شده برای اجرا روی CPU بدون نیاز به GPU. | |
""" | |
import os | |
import re | |
import gc | |
import time | |
import json | |
import uuid | |
import logging | |
import hashlib | |
import traceback | |
import threading | |
import subprocess | |
import contextlib | |
import signal | |
import sys | |
from datetime import datetime, timedelta | |
from typing import List, Dict, Tuple, Any, Optional | |
from concurrent.futures import ThreadPoolExecutor, as_completed | |
from functools import lru_cache, wraps | |
from difflib import SequenceMatcher | |
from collections import deque | |
from dataclasses import dataclass, field | |
import cv2 | |
import numpy as np | |
import fitz # PyMuPDF | |
import pytesseract | |
import requests | |
from pdf2image import convert_from_path | |
from PIL import Image, ImageEnhance, ImageFilter | |
import gradio as gr | |
import multiprocessing | |
import psutil | |
import platform | |
import socket | |
import shutil | |
import glob | |
from pathlib import Path | |
from colorama import Fore, Style, init | |
# colorama setup
init()
# Configuration settings
CONFIG = { | |
"TESSDATA_LOCAL": os.environ.get("TESSDATA_PATH", "tessdata"), | |
"TESSERACT_CMD": os.environ.get("TESSERACT_CMD", "/usr/bin/tesseract"), | |
"CACHE_DIR": os.environ.get("CACHE_DIR", "cache"), | |
"MODEL_DIR": os.environ.get("MODEL_DIR", "models"), | |
"LOG_DIR": os.environ.get("LOG_DIR", "logs"), | |
"TEMP_DIR": os.environ.get("TEMP_DIR", "temp"), | |
"OUTPUT_DIR": os.environ.get("OUTPUT_DIR", "output"), | |
"DATASET_DIR": os.environ.get("DATASET_DIR", "datasets"), | |
"MAX_CACHE_SIZE_MB": int(os.environ.get("MAX_CACHE_SIZE_MB", "2048")), | |
"MAX_WORKERS": min(multiprocessing.cpu_count(), 1), | |
"MAX_MEMORY_PERCENT": float(os.environ.get("MAX_MEMORY_PERCENT", "99.5")), # آستانه حافظه | |
"MAX_CPU_PERCENT": float(os.environ.get("MAX_CPU_PERCENT", "80.0")), | |
"MIN_FREE_SPACE_BYTES": 1024**3, | |
"DEFAULT_LANGUAGE": os.environ.get("DEFAULT_LANGUAGE", "fas"), | |
"SUPPORTED_LANGUAGES": ["fas", "eng", "ara", "eng+fas", "fas+eng", "ara+fas"], | |
"CONFIDENCE_THRESHOLD": float(os.environ.get("CONFIDENCE_THRESHOLD", "0.65")), | |
"PAGE_SEGMENTATION_MODE": os.environ.get("PAGE_SEGMENTATION_MODE", "3"), | |
"OCR_ENGINE_MODE": os.environ.get("OCR_ENGINE_MODE", "1"), | |
"AZURE_API_KEY": os.environ.get("AZURE_API_KEY", ""), | |
"AZURE_ENDPOINT": os.environ.get("AZURE_ENDPOINT", ""), | |
"GOOGLE_APPLICATION_CREDENTIALS": os.environ.get("GOOGLE_APPLICATION_CREDENTIALS", ""), | |
"DEBUG_MODE": os.environ.get("DEBUG_MODE", "false").lower() == "true", | |
"VERSION": "2.0.1", | |
"UI_TITLE": "سیستم OCR پارسی" | |
} | |
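# Note: every CONFIG entry that reads os.environ can be overridden before launch; the values
# below are only illustrative examples, not required settings:
#   export TESSDATA_PATH=/opt/tessdata
#   export DEFAULT_LANGUAGE=fas+eng
#   export DEBUG_MODE=true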
# Configure logging with colorized output
logging.basicConfig(level=logging.INFO, format=f"{Fore.GREEN}%(asctime)s - %(name)s - %(levelname)s - {Fore.RESET}%(message)s") | |
def log_with_check(message, success=True): | |
"""لاگ با تیک یا علامت خطا و جداسازی با خط چین""" | |
separator = f"{Fore.YELLOW}---{Style.RESET_ALL}" | |
check = f"{Fore.GREEN}✅{Style.RESET_ALL}" if success else f"{Fore.RED}❌{Style.RESET_ALL}" | |
logging.info(f"{separator}\n{check} {message}\n{separator}") | |
def init_directories(): | |
"""ایجاد دایرکتوریهای مورد نیاز""" | |
directories = [ | |
CONFIG["CACHE_DIR"], CONFIG["MODEL_DIR"], CONFIG["LOG_DIR"], | |
CONFIG["TEMP_DIR"], CONFIG["OUTPUT_DIR"], CONFIG["DATASET_DIR"], | |
CONFIG["TESSDATA_LOCAL"] | |
] | |
for directory in directories: | |
os.makedirs(directory, exist_ok=True) | |
log_with_check("Directories initialized") | |
def check_optional_libs(): | |
"""بررسی دسترسی به کتابخانههای اختیاری""" | |
CONFIG["AZURE_OCR_AVAILABLE"] = False | |
CONFIG["GOOGLE_OCR_AVAILABLE"] = False | |
try: | |
from azure.cognitiveservices.vision.computervision import ComputerVisionClient | |
CONFIG["AZURE_OCR_AVAILABLE"] = True | |
except ImportError: | |
pass | |
try: | |
from google.cloud import vision | |
CONFIG["GOOGLE_OCR_AVAILABLE"] = True | |
except ImportError: | |
pass | |
log_with_check("Optional libraries checked") | |
# Decorators
def timed(func):
    """Log the wrapped function's execution time at debug level."""
    @wraps(func)
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        logging.debug(f"{func.__name__} took {time.time() - start_time:.2f} seconds")
        return result
    return wrapper
@contextlib.contextmanager
def temp_file(suffix=None):
    """Context manager that yields a unique temp-file path and removes it on exit."""
    temp_path = os.path.join(CONFIG["TEMP_DIR"], f"temp_{uuid.uuid4()}{suffix or ''}")
    try:
        yield temp_path
    finally:
        if os.path.exists(temp_path):
            try:
                os.remove(temp_path)
            except OSError:
                pass
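# Minimal usage sketch for temp_file (illustrative; `some_bgr_image` is a placeholder array):
#   with temp_file(".png") as tmp_path:
#       cv2.imwrite(tmp_path, some_bgr_image)
#       ...  # use tmp_path while inside the block
#   # the file is deleted automatically on exit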
class Utils: | |
def detect_language(text): | |
if any('\u0600' <= c <= '\u06FF' for c in text): | |
return 'fa' | |
return 'en' | |
def is_rtl_language(lang_code): | |
return lang_code in ['fa', 'fas', 'ar', 'ara'] | |
def get_file_extension(file_path): | |
return os.path.splitext(file_path)[1].lower() | |
def get_human_readable_size(size, decimal_places=2): | |
for unit in ['B', 'KB', 'MB', 'GB', 'TB']: | |
if size < 1024.0: | |
break | |
size /= 1024.0 | |
return f"{size:.{decimal_places}f} {unit}" | |
def retry(max_attempts=3, delay=1, exceptions=(Exception,)): | |
def decorator(func): | |
def wrapper(*args, **kwargs): | |
for attempt in range(max_attempts): | |
try: | |
return func(*args, **kwargs) | |
except exceptions as e: | |
if attempt == max_attempts - 1: | |
raise | |
logging.warning(f"Retry {attempt+1}/{max_attempts} for {func.__name__} after error: {str(e)}") | |
time.sleep(delay) | |
return wrapper | |
return decorator | |
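# Minimal usage sketch for the retry decorator (illustrative; download_traineddata is a
# hypothetical function, not part of this module):
#   @retry(max_attempts=3, delay=2, exceptions=(requests.RequestException,))
#   def download_traineddata(url):
#       return requests.get(url, timeout=30).content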
@dataclass
class ResourceUsage:
    cpu_percent: float
    memory_percent: float
    gpu_memory_percent: Optional[float] = None
    timestamp: datetime = None
@dataclass
class LoadPrediction:
    expected_load: float
    confidence: float
    time_window: timedelta
    details: Dict
@dataclass
class ResourceMetrics:
    memory_usage: float
    cpu_usage: float
    disk_usage: float
    timestamp: datetime
@dataclass
class ScanRegion:
    content: str
    confidence: float
    language: str = 'unknown'
    direction: str = 'ltr'
@dataclass
class DocumentStructure:
    title: str
    language: str
    page_count: int
    toc: Dict
    sections: List[Dict]
    metadata: Dict
    attributes: Dict
@dataclass
class ProcessingConfig:
    use_preprocessing: bool = True
    use_caching: bool = True
    use_distributed: bool = False  # read by DocumentProcessor._perform_ocr; assumed to default to off
    max_workers: int = CONFIG["MAX_WORKERS"]
    language: str = CONFIG["DEFAULT_LANGUAGE"]
    confidence_threshold: float = CONFIG["CONFIDENCE_THRESHOLD"]
class CacheManager: | |
def __init__(self, cache_dir=CONFIG["CACHE_DIR"], max_cache_size_mb=CONFIG["MAX_CACHE_SIZE_MB"]): | |
self.cache_dir = cache_dir | |
self.max_cache_size_mb = max_cache_size_mb | |
self.cache_entries = {} | |
self.access_history = deque(maxlen=1000) | |
self.logger = logging.getLogger(__name__) | |
        self.lock = threading.RLock()  # reentrant: set() calls _clean_cache() while already holding the lock
os.makedirs(self.cache_dir, exist_ok=True) | |
self._load_index() | |
log_with_check("CacheManager initialized") | |
def _hash_key(self, key: str) -> str: | |
return hashlib.md5(key.encode('utf-8')).hexdigest() | |
def _cache_path(self, key_hash: str) -> str: | |
return os.path.join(self.cache_dir, f"{key_hash}.cache") | |
def _load_index(self): | |
index_path = os.path.join(self.cache_dir, "index.json") | |
try: | |
if os.path.exists(index_path): | |
with open(index_path, 'r', encoding='utf-8') as f: | |
cache_data = json.load(f) | |
self.cache_entries = cache_data.get('entries', {}) | |
log_with_check(f"Loaded {len(self.cache_entries)} cache entries") | |
except Exception as e: | |
self.logger.error(f"Error loading cache index: {str(e)}") | |
log_with_check("Failed to load cache index", False) | |
def _save_index(self): | |
index_path = os.path.join(self.cache_dir, "index.json") | |
try: | |
with open(index_path, 'w', encoding='utf-8') as f: | |
json.dump({'entries': self.cache_entries}, f) | |
log_with_check("Cache index saved") | |
except Exception as e: | |
self.logger.error(f"Error saving cache index: {str(e)}") | |
log_with_check("Failed to save cache index", False) | |
def get(self, key: str) -> Any: | |
with self.lock: | |
key_hash = self._hash_key(key) | |
if key_hash not in self.cache_entries: | |
return None | |
cache_path = self._cache_path(key_hash) | |
if not os.path.exists(cache_path): | |
del self.cache_entries[key_hash] | |
self._save_index() | |
return None | |
try: | |
with open(cache_path, 'rb') as f: | |
import pickle | |
result = pickle.load(f) | |
self.cache_entries[key_hash]['last_access'] = datetime.now().isoformat() | |
self.access_history.append(key_hash) | |
return result | |
except Exception as e: | |
self.logger.error(f"Error retrieving from cache: {str(e)}") | |
return None | |
def set(self, key: str, value: Any, expire_seconds: int = 86400) -> bool: | |
with self.lock: | |
if self._check_cache_size() > self.max_cache_size_mb: | |
self._clean_cache() | |
key_hash = self._hash_key(key) | |
cache_path = self._cache_path(key_hash) | |
try: | |
with open(cache_path, 'wb') as f: | |
import pickle | |
pickle.dump(value, f) | |
now = datetime.now() | |
self.cache_entries[key_hash] = { | |
'key': key, | |
'created': now.isoformat(), | |
'last_access': now.isoformat(), | |
'expires': (now + timedelta(seconds=expire_seconds)).isoformat(), | |
'size': os.path.getsize(cache_path) | |
} | |
self.access_history.append(key_hash) | |
self._save_index() | |
return True | |
except Exception as e: | |
self.logger.error(f"Error storing in cache: {str(e)}") | |
return False | |
def _check_cache_size(self) -> float: | |
total_size = sum(entry.get('size', 0) for entry in self.cache_entries.values()) | |
return total_size / (1024 * 1024) | |
def _clean_cache(self): | |
with self.lock: | |
now = datetime.now() | |
expired_keys = [k for k, v in self.cache_entries.items() if datetime.fromisoformat(v['expires']) < now] | |
for key in expired_keys: | |
self._remove_item(key) | |
if self._check_cache_size() > self.max_cache_size_mb * 0.8: | |
access_counts = {key: self.access_history.count(key) for key in set(self.access_history)} | |
to_remove = sorted( | |
[k for k in self.cache_entries.keys() if k not in expired_keys], | |
key=lambda k: access_counts.get(k, 0) | |
) | |
for key in to_remove: | |
self._remove_item(key) | |
if self._check_cache_size() < self.max_cache_size_mb * 0.7: | |
break | |
def _remove_item(self, key_hash: str): | |
try: | |
cache_path = self._cache_path(key_hash) | |
if os.path.exists(cache_path): | |
os.remove(cache_path) | |
if key_hash in self.cache_entries: | |
del self.cache_entries[key_hash] | |
except Exception as e: | |
self.logger.error(f"Error removing cache item: {str(e)}") | |
def clear(self): | |
with self.lock: | |
for key_hash in list(self.cache_entries.keys()): | |
self._remove_item(key_hash) | |
self.cache_entries = {} | |
self.access_history.clear() | |
self._save_index() | |
def generate_key(self, image: np.ndarray) -> str: | |
small_img = cv2.resize(image, (32, 32)) | |
if len(small_img.shape) == 3: | |
small_img = cv2.cvtColor(small_img, cv2.COLOR_BGR2GRAY) | |
img_hash = hashlib.md5(small_img.tobytes()).hexdigest() | |
return f"img:{img_hash}" | |
class Normalizer: | |
def __init__(self): | |
self.logger = logging.getLogger(__name__) | |
log_with_check("Normalizer initialized") | |
self.char_mappings = { | |
'ك': 'ک', 'ي': 'ی', 'أ': 'ا', 'إ': 'ا', 'آ': 'ا', 'ة': 'ه', | |
'٠': '0', '١': '1', '٢': '2', '٣': '3', '٤': '4', | |
'٥': '5', '٦': '6', '٧': '7', '٨': '8', '٩': '9', | |
'۰': '0', '۱': '1', '۲': '2', '۳': '3', '۴': '4', | |
'۵': '5', '۶': '6', '۷': '7', '۸': '8', '۹': '9' | |
} | |
self.space_patterns = [ | |
(r'\s+', ' '), (r'ـ+', ''), (r'[.]{2,}', '...'), | |
(r'[\u200c\u200f\u200e]+', '\u200c') | |
] | |
def normalize(self, text: str, normalize_chars: bool = True, normalize_spaces: bool = True) -> str: | |
if not text: | |
return text | |
result = text | |
if normalize_chars: | |
for src, dst in self.char_mappings.items(): | |
result = result.replace(src, dst) | |
if normalize_spaces: | |
for pattern, replacement in self.space_patterns: | |
result = re.sub(pattern, replacement, result) | |
return result.strip() | |
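# Illustrative effect of Normalizer.normalize: Arabic kaf/yeh become their Persian forms,
# Arabic-Indic and Persian digits become ASCII digits, and whitespace runs collapse, e.g.
#   Normalizer().normalize("كتاب   ۱۲٣")  ->  "کتاب 123"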
class AdaptiveLearner: | |
def __init__(self, model_dir: str = CONFIG["MODEL_DIR"]): | |
os.makedirs(model_dir, exist_ok=True) | |
self.model_dir = model_dir | |
self.corrections = {} | |
self.confidence_threshold = CONFIG["CONFIDENCE_THRESHOLD"] | |
self.logger = logging.getLogger(__name__) | |
        self.lock = threading.RLock()  # reentrant: learn_correction() calls _save_corrections() while already holding the lock
self._load_corrections() | |
log_with_check("AdaptiveLearner initialized") | |
def _load_corrections(self): | |
corrections_path = os.path.join(self.model_dir, "corrections.json") | |
try: | |
if os.path.exists(corrections_path): | |
with open(corrections_path, 'r', encoding='utf-8') as f: | |
self.corrections = json.load(f) | |
log_with_check(f"Loaded {sum(len(v) for k, v in self.corrections.items() if isinstance(v, dict))} text corrections") | |
else: | |
log_with_check("No corrections file found", False) | |
except Exception as e: | |
self.logger.error(f"Error loading corrections: {str(e)}") | |
log_with_check("Failed to load corrections", False) | |
self.corrections = {} | |
def _save_corrections(self): | |
with self.lock: | |
corrections_path = os.path.join(self.model_dir, "corrections.json") | |
try: | |
with open(corrections_path, 'w', encoding='utf-8') as f: | |
json.dump(self.corrections, f, ensure_ascii=False, indent=2) | |
log_with_check("Corrections saved") | |
except Exception as e: | |
self.logger.error(f"Error saving corrections: {str(e)}") | |
log_with_check("Failed to save corrections", False) | |
def apply_corrections(self, text: str, context: Dict) -> Tuple[str, float]: | |
if not text: | |
return text, 1.0 | |
language = context.get('language', 'unknown') | |
corrected_text = text | |
confidence = 1.0 | |
for pattern, replacement in self.corrections.get('general', {}).items(): | |
corrected_text = re.sub(pattern, replacement, corrected_text) | |
for pattern, replacement in self.corrections.get(language, {}).items(): | |
corrected_text = re.sub(pattern, replacement, corrected_text) | |
if text != corrected_text: | |
similarity = SequenceMatcher(None, text, corrected_text).ratio() | |
confidence = similarity | |
return corrected_text, confidence | |
def learn_correction(self, original: str, corrected: str, context: Dict): | |
if original == corrected: | |
return | |
with self.lock: | |
language = context.get('language', 'general') | |
if language not in self.corrections: | |
self.corrections[language] = {} | |
if len(original) > 10: | |
matcher = SequenceMatcher(None, original, corrected) | |
for tag, i1, i2, j1, j2 in matcher.get_opcodes(): | |
if tag == 'replace': | |
pattern = re.escape(original[i1:i2]) | |
replacement = corrected[j1:j2] | |
self.corrections[language][pattern] = replacement | |
else: | |
self.corrections[language][re.escape(original)] = corrected | |
self._save_corrections() | |
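# Minimal usage sketch for AdaptiveLearner (illustrative values):
#   learner = AdaptiveLearner()
#   learner.learn_correction("رسيد", "رسید", {"language": "fas"})     # record one manual fix
#   fixed, confidence = learner.apply_corrections("رسيد", {"language": "fas"})
# Corrections are persisted as regex-pattern -> replacement pairs in <MODEL_DIR>/corrections.json.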
class ImageProcessor: | |
def __init__(self): | |
self.logger = logging.getLogger(__name__) | |
log_with_check("ImageProcessor initialized") | |
    def deskew(self, image: np.ndarray) -> np.ndarray:
try: | |
if len(image.shape) == 3: | |
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) | |
else: | |
gray = image.copy() | |
thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1] | |
coords = np.column_stack(np.where(thresh > 0)) | |
angle = cv2.minAreaRect(coords)[-1] | |
if angle < -45: | |
angle = -(90 + angle) | |
else: | |
angle = -angle | |
(h, w) = image.shape[:2] | |
center = (w // 2, h // 2) | |
matrix = cv2.getRotationMatrix2D(center, angle, 1.0) | |
rotated = cv2.warpAffine(image, matrix, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE) | |
return rotated | |
except Exception as e: | |
logging.error(f"Deskew error: {str(e)}") | |
return image | |
    def remove_noise(self, image: np.ndarray) -> np.ndarray:
try: | |
if len(image.shape) == 3: | |
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) | |
else: | |
gray = image.copy() | |
return cv2.medianBlur(gray, 3) | |
except Exception as e: | |
logging.error(f"Noise removal error: {str(e)}") | |
return image | |
    def adjust_contrast(self, image: np.ndarray, factor=1.5) -> np.ndarray:
try: | |
if len(image.shape) == 3: | |
pil_image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB)) | |
else: | |
pil_image = Image.fromarray(image) | |
enhancer = ImageEnhance.Contrast(pil_image) | |
enhanced_img = enhancer.enhance(factor) | |
if len(image.shape) == 3: | |
return cv2.cvtColor(np.array(enhanced_img), cv2.COLOR_RGB2BGR) | |
return np.array(enhanced_img) | |
except Exception as e: | |
logging.error(f"Contrast adjustment error: {str(e)}") | |
return image | |
    def sharpen(self, image: np.ndarray) -> np.ndarray:
try: | |
if len(image.shape) == 3: | |
pil_image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB)) | |
else: | |
pil_image = Image.fromarray(image) | |
sharpened = pil_image.filter(ImageFilter.SHARPEN) | |
if len(image.shape) == 3: | |
return cv2.cvtColor(np.array(sharpened), cv2.COLOR_RGB2BGR) | |
return np.array(sharpened) | |
except Exception as e: | |
logging.error(f"Sharpen error: {str(e)}") | |
return image | |
    def binarize(self, image: np.ndarray, method="adaptive") -> np.ndarray:
try: | |
if len(image.shape) == 3: | |
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) | |
else: | |
gray = image.copy() | |
if method == "otsu": | |
return cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1] | |
return cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2) | |
except Exception as e: | |
logging.error(f"Binarization error: {str(e)}") | |
return image | |
def enhance_for_ocr(self, image: np.ndarray) -> np.ndarray: | |
try: | |
enhanced = image.copy() | |
enhanced = self.deskew(enhanced) | |
enhanced = self.remove_noise(enhanced) | |
enhanced = self.adjust_contrast(enhanced, 1.5) | |
enhanced = self.sharpen(enhanced) | |
enhanced = self.binarize(enhanced, "adaptive") | |
return enhanced | |
except Exception as e: | |
self.logger.error(f"Image enhancement error: {str(e)}") | |
return image | |
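# The enhancement pipeline above applies, in order: deskew -> median-blur denoise -> contrast
# boost -> sharpen -> adaptive binarization. A quick standalone check (illustrative path):
#   page = cv2.imread("scan.jpg")
#   ready = ImageProcessor().enhance_for_ocr(page)
#   cv2.imwrite("scan_ready.png", ready)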
class CloudOCRProvider: | |
def __init__(self): | |
self.logger = logging.getLogger(__name__) | |
log_with_check("CloudOCRProvider initialized") | |
self.azure_client = None | |
        self.google_client = None
        check_optional_libs()  # populate the *_OCR_AVAILABLE flags before they are read below
if CONFIG.get("AZURE_OCR_AVAILABLE", False) and CONFIG["AZURE_API_KEY"] and CONFIG["AZURE_ENDPOINT"]: | |
try: | |
from azure.cognitiveservices.vision.computervision import ComputerVisionClient | |
from msrest.authentication import CognitiveServicesCredentials | |
self.azure_client = ComputerVisionClient(CONFIG["AZURE_ENDPOINT"], CognitiveServicesCredentials(CONFIG["AZURE_API_KEY"])) | |
log_with_check("Azure OCR initialized") | |
except Exception as e: | |
self.logger.error(f"Azure OCR initialization error: {str(e)}") | |
log_with_check("Azure OCR initialization failed", False) | |
if CONFIG.get("GOOGLE_OCR_AVAILABLE", False) and CONFIG["GOOGLE_APPLICATION_CREDENTIALS"]: | |
try: | |
from google.cloud import vision | |
self.google_client = vision.ImageAnnotatorClient() | |
log_with_check("Google OCR initialized") | |
except Exception as e: | |
self.logger.error(f"Google OCR initialization error: {str(e)}") | |
log_with_check("Google OCR initialization failed", False) | |
def is_available(self): | |
return self.azure_client is not None or self.google_client is not None | |
def process_with_azure(self, image_path): | |
if not self.azure_client: | |
return None | |
try: | |
with open(image_path, "rb") as image_file: | |
image_data = image_file.read() | |
from azure.cognitiveservices.vision.computervision.models import OperationStatusCodes | |
recognize_results = self.azure_client.recognize_printed_text_in_stream(image_data) | |
result = {"regions": [], "text": "", "language": "", "confidence": 0.0} | |
all_text = [] | |
total_confidence = 0.0 | |
count = 0 | |
for region in recognize_results.regions: | |
region_text = [] | |
region_info = {"bounding_box": region.bounding_box.split(","), "lines": []} | |
for line in region.lines: | |
line_text = [] | |
line_info = {"bounding_box": line.bounding_box.split(","), "words": []} | |
for word in line.words: | |
line_text.append(word.text) | |
total_confidence += word.confidence | |
count += 1 | |
line_info["words"].append({"text": word.text, "confidence": word.confidence, "bounding_box": word.bounding_box.split(",")}) | |
full_line = " ".join(line_text) | |
region_text.append(full_line) | |
line_info["text"] = full_line | |
region_info["lines"].append(line_info) | |
full_region = "\n".join(region_text) | |
all_text.append(full_region) | |
region_info["text"] = full_region | |
result["regions"].append(region_info) | |
result["text"] = "\n\n".join(all_text) | |
if count > 0: | |
result["confidence"] = total_confidence / count | |
result["language"] = recognize_results.language or "unknown" | |
return result | |
except Exception as e: | |
self.logger.error(f"Azure OCR error: {str(e)}") | |
return None | |
def process_with_google(self, image_path): | |
if not self.google_client: | |
return None | |
try: | |
with open(image_path, "rb") as image_file: | |
content = image_file.read() | |
from google.cloud import vision | |
image = vision.Image(content=content) | |
response = self.google_client.text_detection(image=image) | |
if response.error.message: | |
self.logger.error(f"Google OCR API error: {response.error.message}") | |
return None | |
result = {"regions": [], "text": "", "language": "", "confidence": 0.0} | |
full_text_annotation = response.full_text_annotation | |
if full_text_annotation: | |
result["text"] = full_text_annotation.text | |
for page in full_text_annotation.pages: | |
for block in page.blocks: | |
block_info = {"bounding_box": [[vertex.x, vertex.y] for vertex in block.bounding_box.vertices], "paragraphs": [], "text": ""} | |
block_texts = [] | |
for paragraph in block.paragraphs: | |
para_info = {"bounding_box": [[vertex.x, vertex.y] for vertex in paragraph.bounding_box.vertices], "words": [], "text": ""} | |
para_texts = [] | |
for word in paragraph.words: | |
word_text = "".join([symbol.text for symbol in word.symbols]) | |
para_texts.append(word_text) | |
word_info = {"text": word_text, "bounding_box": [[vertex.x, vertex.y] for vertex in word.bounding_box.vertices], "confidence": word.confidence} | |
para_info["words"].append(word_info) | |
para_info["text"] = " ".join(para_texts) | |
block_texts.append(para_info["text"]) | |
block_info["paragraphs"].append(para_info) | |
block_info["text"] = "\n".join(block_texts) | |
result["regions"].append(block_info) | |
result["confidence"] = full_text_annotation.pages[0].confidence if full_text_annotation.pages else 0.0 | |
if result["text"]: | |
result["language"] = Utils.detect_language(result["text"]) | |
return result | |
except Exception as e: | |
self.logger.error(f"Google OCR error: {str(e)}") | |
return None | |
def process_image(self, image_path, prefer_provider=None): | |
providers = [] | |
if prefer_provider: | |
if prefer_provider == "azure" and self.azure_client: | |
providers = ["azure", "google", "local"] | |
elif prefer_provider == "google" and self.google_client: | |
providers = ["google", "azure", "local"] | |
else: | |
providers = ["local", "azure", "google"] | |
else: | |
if self.azure_client: | |
providers.append("azure") | |
if self.google_client: | |
providers.append("google") | |
providers.append("local") | |
for provider in providers: | |
try: | |
if provider == "azure": | |
result = self.process_with_azure(image_path) | |
if result: | |
result["provider"] = "azure" | |
return result | |
elif provider == "google": | |
result = self.process_with_google(image_path) | |
if result: | |
result["provider"] = "google" | |
return result | |
elif provider == "local": | |
img = cv2.imread(image_path) | |
if img is None: | |
continue | |
                    enhanced = ImageProcessor().enhance_for_ocr(img)  # CloudOCRProvider has no enhancement method of its own
text = pytesseract.image_to_string(enhanced, lang=CONFIG["DEFAULT_LANGUAGE"], | |
config=f'--oem {CONFIG["OCR_ENGINE_MODE"]} --psm {CONFIG["PAGE_SEGMENTATION_MODE"]}') | |
boxes = pytesseract.image_to_data(enhanced, lang=CONFIG["DEFAULT_LANGUAGE"], | |
config=f'--oem {CONFIG["OCR_ENGINE_MODE"]} --psm {CONFIG["PAGE_SEGMENTATION_MODE"]}', | |
output_type=pytesseract.Output.DICT) | |
result = {"text": text, "regions": [], "provider": "local", "language": CONFIG["DEFAULT_LANGUAGE"], "confidence": 0.0} | |
confidences = [] | |
current_block = -1 | |
current_region = {} | |
current_lines = [] | |
for i in range(len(boxes['text'])): | |
if boxes['text'][i].strip(): | |
confidences.append(boxes['conf'][i]) | |
if boxes['block_num'][i] != current_block: | |
if current_block != -1 and current_lines: | |
current_region["text"] = "\n".join(current_lines) | |
result["regions"].append(current_region) | |
current_block = boxes['block_num'][i] | |
current_region = {"bounding_box": [boxes['left'][i], boxes['top'][i], | |
boxes['left'][i] + boxes['width'][i], | |
boxes['top'][i] + boxes['height'][i]], "lines": []} | |
current_lines = [] | |
line_num = boxes['line_num'][i] | |
if line_num >= len(current_lines): | |
current_lines.append(boxes['text'][i]) | |
else: | |
current_lines[line_num] += " " + boxes['text'][i] | |
if current_block != -1 and current_lines: | |
current_region["text"] = "\n".join(current_lines) | |
result["regions"].append(current_region) | |
if confidences: | |
valid_confidences = [c for c in confidences if c > 0] | |
if valid_confidences: | |
result["confidence"] = sum(valid_confidences) / len(valid_confidences) / 100.0 | |
return result | |
except Exception as e: | |
self.logger.error(f"OCR with {provider} failed: {str(e)}") | |
continue | |
return None | |
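# CloudOCRProvider.process_image tries providers in order: the preferred cloud provider (when
# its client exists), then the remaining cloud provider, then local Tesseract; the first
# non-None result wins and is tagged in result["provider"].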
class ResourceManager: | |
"""کلاس مدیریت منابع سیستمی""" | |
def __init__(self, max_memory_percent=CONFIG["MAX_MEMORY_PERCENT"], max_cpu_percent=CONFIG["MAX_CPU_PERCENT"], min_free_space_bytes=CONFIG["MIN_FREE_SPACE_BYTES"]): | |
self.logger = logging.getLogger(__name__) | |
self.max_memory_percent = max_memory_percent | |
self.max_cpu_percent = max_cpu_percent | |
self.min_free_space_bytes = min_free_space_bytes | |
self.metrics_history = deque(maxlen=1000) | |
self.last_cleanup = None | |
self.resource_warnings = 0 | |
self.is_cleaning = False | |
        self.lock = threading.RLock()  # reentrant: check_resources() calls get_current_metrics() while already holding the lock
self.stats = {'total_processed': 0, 'successful_extractions': 0, 'failed_extractions': 0} | |
log_with_check("ResourceManager initialized") | |
def get_current_metrics(self) -> ResourceMetrics: | |
try: | |
memory = psutil.virtual_memory() | |
cpu = psutil.cpu_percent(interval=0.5) | |
disk = psutil.disk_usage('/') | |
metrics = ResourceMetrics(memory.percent, cpu, disk.percent, datetime.now()) | |
with self.lock: | |
self.metrics_history.append(metrics) | |
return metrics | |
except Exception as e: | |
self.logger.error(f"Error getting metrics: {str(e)}") | |
return ResourceMetrics(0.0, 0.0, 0.0, datetime.now()) | |
def check_resources(self) -> bool: | |
with self.lock: | |
if self.is_cleaning: | |
return False | |
metrics = self.get_current_metrics() | |
needs_cleanup = ( | |
metrics.memory_usage > self.max_memory_percent or | |
metrics.cpu_usage > self.max_cpu_percent or | |
psutil.disk_usage('/').free < self.min_free_space_bytes | |
) | |
if needs_cleanup: | |
self.cleanup_resources() | |
return True | |
return False | |
def cleanup_resources(self): | |
if self.is_cleaning: | |
return | |
self.is_cleaning = True | |
self.logger.info("Starting resource cleanup") | |
try: | |
gc.collect() | |
self._cleanup_temp_files() | |
self.last_cleanup = datetime.now() | |
log_with_check("Resource cleanup completed") | |
except Exception as e: | |
self.logger.error(f"Error during cleanup: {str(e)}") | |
log_with_check("Resource cleanup failed", False) | |
finally: | |
self.is_cleaning = False | |
def _cleanup_temp_files(self): | |
try: | |
count = 0 | |
for filename in os.listdir(CONFIG["TEMP_DIR"]): | |
file_path = os.path.join(CONFIG["TEMP_DIR"], filename) | |
if os.path.isfile(file_path): | |
os.remove(file_path) | |
count += 1 | |
self.logger.info(f"Cleaned {count} temp files") | |
except Exception as e: | |
self.logger.error(f"Error cleaning temp files: {str(e)}") | |
class LoadPredictor: | |
def __init__(self, history_size: int = 1000, update_interval: int = 60): | |
self.logger = logging.getLogger(__name__) | |
self.history_size = history_size | |
self.update_interval = update_interval | |
self.resource_history = deque(maxlen=history_size) | |
self.current_predictions = {'short_term': None, 'medium_term': None, 'long_term': None} | |
self.thresholds = {'cpu_high': 80.0, 'memory_high': 99.5, 'prediction_confidence': 0.7} | |
self.monitoring = False | |
self.monitor_thread = None | |
self._start_monitoring() | |
log_with_check("LoadPredictor initialized") | |
def _start_monitoring(self): | |
self.monitoring = True | |
self.monitor_thread = threading.Thread(target=self._monitor_loop) | |
self.monitor_thread.daemon = True | |
self.monitor_thread.start() | |
log_with_check("Monitoring thread started") | |
def _monitor_loop(self): | |
while self.monitoring: | |
try: | |
usage = self._collect_resource_usage() | |
self.resource_history.append(usage) | |
self._update_predictions() | |
self._check_alerts() | |
time.sleep(self.update_interval) | |
except Exception as e: | |
self.logger.error(f"Error in monitor loop: {str(e)}") | |
log_with_check("Monitor loop error", False) | |
def _collect_resource_usage(self) -> ResourceUsage: | |
try: | |
cpu_percent = psutil.cpu_percent(interval=1) | |
memory_percent = psutil.virtual_memory().percent | |
return ResourceUsage(cpu_percent, memory_percent, None, datetime.now()) | |
except Exception as e: | |
self.logger.error(f"Error collecting resource usage: {str(e)}") | |
return ResourceUsage(0.0, 0.0, None, datetime.now()) | |
def _update_predictions(self): | |
try: | |
self.current_predictions['short_term'] = self._predict_load(timedelta(minutes=5)) | |
self.current_predictions['medium_term'] = self._predict_load(timedelta(hours=1)) | |
self.current_predictions['long_term'] = self._predict_load(timedelta(days=1)) | |
except Exception as e: | |
self.logger.error(f"Error updating predictions: {str(e)}") | |
def _predict_load(self, time_window: timedelta) -> LoadPrediction: | |
if len(self.resource_history) < 10: | |
return LoadPrediction(50.0, 0.5, time_window, {}) | |
try: | |
history_array = np.array([[usage.cpu_percent, usage.memory_percent] for usage in self.resource_history]) | |
x = np.arange(len(history_array)) | |
cpu_trend = np.polyfit(x, history_array[:, 0], 2) | |
memory_trend = np.polyfit(x, history_array[:, 1], 2) | |
future_point = len(history_array) + time_window.total_seconds() / self.update_interval | |
predicted_cpu = np.polyval(cpu_trend, future_point) | |
predicted_memory = np.polyval(memory_trend, future_point) | |
confidence = self._calculate_prediction_confidence(history_array, cpu_trend, memory_trend) | |
return LoadPrediction((predicted_cpu + predicted_memory) / 2, confidence, time_window, {'cpu': predicted_cpu, 'memory': predicted_memory}) | |
except Exception as e: | |
self.logger.error(f"Error in load prediction: {str(e)}") | |
return LoadPrediction(50.0, 0.3, time_window, {}) | |
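    # _predict_load fits a degree-2 polynomial (np.polyfit) to the recorded CPU and memory
    # history and extrapolates it time_window / update_interval samples ahead; the confidence
    # computed below reflects how well that polynomial reproduces the history (RMSE mapped
    # into [0, 1]).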
def _calculate_prediction_confidence(self, history: np.ndarray, cpu_trend: np.ndarray, memory_trend: np.ndarray) -> float: | |
try: | |
x = np.arange(len(history)) | |
cpu_predictions = np.polyval(cpu_trend, x) | |
memory_predictions = np.polyval(memory_trend, x) | |
cpu_rmse = np.sqrt(np.mean((history[:, 0] - cpu_predictions) ** 2)) | |
memory_rmse = np.sqrt(np.mean((history[:, 1] - memory_predictions) ** 2)) | |
max_rmse = 50.0 | |
cpu_confidence = max(0.0, min(1.0, 1.0 - cpu_rmse / max_rmse)) | |
memory_confidence = max(0.0, min(1.0, 1.0 - memory_rmse / max_rmse)) | |
return 0.6 * cpu_confidence + 0.4 * memory_confidence | |
except Exception as e: | |
self.logger.error(f"Error calculating prediction confidence: {str(e)}") | |
return 0.5 | |
def _check_alerts(self): | |
current_usage = self.resource_history[-1] if self.resource_history else None | |
if not current_usage: | |
return | |
alerts = [] | |
if current_usage.cpu_percent > self.thresholds['cpu_high']: | |
alerts.append({'type': 'high_cpu', 'value': current_usage.cpu_percent}) | |
if current_usage.memory_percent > self.thresholds['memory_high']: | |
alerts.append({'type': 'high_memory', 'value': current_usage.memory_percent}) | |
if alerts: | |
self._handle_alerts(alerts) | |
def _handle_alerts(self, alerts: List[Dict]): | |
for alert in alerts: | |
self.logger.warning(f"هشدار: {alert['type']} در سطح {alert['value']}%") | |
def get_current_load(self) -> Dict: | |
current_usage = self.resource_history[-1] if self.resource_history else None | |
if not current_usage: | |
return {} | |
return { | |
'current': { | |
'cpu': current_usage.cpu_percent, | |
'memory': current_usage.memory_percent, | |
'timestamp': current_usage.timestamp.isoformat() | |
}, | |
'predictions': { | |
name: { | |
'load': pred.expected_load, | |
'confidence': pred.confidence, | |
'resources': pred.details | |
} if pred else None | |
for name, pred in self.current_predictions.items() | |
} | |
} | |
class DocumentProcessor: | |
def __init__(self, config=None): | |
self.config = config or ProcessingConfig() | |
self.logger = logging.getLogger(__name__) | |
self.cache = CacheManager() | |
self.normalizer = Normalizer() | |
self.learner = AdaptiveLearner(model_dir=CONFIG["MODEL_DIR"]) | |
self.img_processor = ImageProcessor() | |
self.cloud_ocr = CloudOCRProvider() | |
self.resource_manager = ResourceManager() | |
self.load_predictor = LoadPredictor() | |
self.executor = ThreadPoolExecutor(max_workers=self.config.max_workers) | |
init_directories() | |
self.setup_tesseract() | |
log_with_check(f"DocumentProcessor initialized (version {CONFIG['VERSION']})") | |
def setup_tesseract(self): | |
try: | |
tesseract_installed = False | |
try: | |
version = subprocess.check_output([CONFIG["TESSERACT_CMD"], '--version'], text=True) | |
self.logger.info(f"Tesseract found: {version.split()[0]}") | |
tesseract_installed = True | |
log_with_check("Tesseract found") | |
            except (subprocess.CalledProcessError, FileNotFoundError):  # a missing binary raises FileNotFoundError, not CalledProcessError
self.logger.warning(f"Tesseract not found in {CONFIG['TESSERACT_CMD']}, attempting to download data and install via system commands") | |
log_with_check("Tesseract not found, attempting installation", False) | |
                # Method 1: install via subprocess.check_call
try: | |
subprocess.check_call(['apt-get', 'update', '-y', '--fix-missing', '--allow-downgrades']) | |
subprocess.check_call(['apt-get', 'install', 'tesseract-ocr', '-y', '--no-install-recommends', '--fix-missing', '--allow-downgrades']) | |
os.environ["TESSDATA_PREFIX"] = os.path.abspath(CONFIG["TESSDATA_LOCAL"]) | |
pytesseract.pytesseract.tesseract_cmd = CONFIG["TESSERACT_CMD"] | |
version = pytesseract.get_tesseract_version() | |
self.logger.info(f"Tesseract installed via Method 1, version: {version}") | |
tesseract_installed = True | |
log_with_check("Tesseract installed via Method 1") | |
except subprocess.CalledProcessError as e: | |
self.logger.warning(f"Method 1 failed: {str(e)}, attempting Method 2") | |
log_with_check("Tesseract installation Method 1 failed", False) | |
                    # Method 2: fall back to os.system
os.system('chmod 777 /tmp') | |
os.system('apt-get update -y --fix-missing --allow-downgrades') | |
os.system('apt-get install tesseract-ocr -y --no-install-recommends --fix-missing --allow-downgrades') | |
os.system('pip install -q pytesseract') | |
if __name__ == "__main__": | |
print("YES") | |
os.environ["TESSDATA_PREFIX"] = os.path.abspath(CONFIG["TESSDATA_LOCAL"]) | |
pytesseract.pytesseract.tesseract_cmd = CONFIG["TESSERACT_CMD"] | |
try: | |
version = pytesseract.get_tesseract_version() | |
self.logger.info(f"Tesseract installed via Method 2, version: {version}") | |
tesseract_installed = True | |
log_with_check("Tesseract installed via Method 2") | |
except Exception as e: | |
self.logger.error(f"Method 2 failed: {str(e)}. Continuing with downloaded data if available.") | |
log_with_check("Tesseract installation Method 2 failed", False) | |
if not tesseract_installed and os.path.exists(CONFIG["TESSDATA_LOCAL"]): | |
os.environ["TESSDATA_PREFIX"] = os.path.abspath(CONFIG["TESSDATA_LOCAL"]) | |
self.logger.warning("Using downloaded Tesseract data without executable") | |
log_with_check("Using downloaded Tesseract data", False) | |
self._download_tesseract_data() | |
try: | |
version = pytesseract.get_tesseract_version() | |
self.logger.info(f"Tesseract is ready, version: {version}") | |
log_with_check("Tesseract setup completed") | |
return True | |
except Exception as e: | |
self.logger.error(f"Tesseract setup failed: {str(e)}. Continuing with downloaded data if available.") | |
log_with_check("Tesseract setup failed", False) | |
return False | |
except Exception as e: | |
self.logger.error(f"Tesseract setup failed: {str(e)}") | |
log_with_check("Tesseract setup failed due to exception", False) | |
return False | |
def _download_tesseract_data(self): | |
try: | |
os.makedirs(CONFIG["TESSDATA_LOCAL"], exist_ok=True) | |
base_url = "https://github.com/tesseract-ocr/tessdata_best/raw/main/" | |
languages = ["fas", "eng", "ara"] | |
import tqdm | |
for lang in languages: | |
lang_file = os.path.join(CONFIG["TESSDATA_LOCAL"], f"{lang}.traineddata") | |
if not os.path.exists(lang_file): | |
self.logger.info(f"Downloading {lang}.traineddata...") | |
response = requests.get(f"{base_url}{lang}.traineddata", stream=True) | |
response.raise_for_status() | |
with open(lang_file, 'wb') as f: | |
total_size = int(response.headers.get('content-length', 0)) | |
with tqdm.tqdm(total=total_size, unit='B', unit_scale=True, desc=lang) as pbar: | |
for chunk in response.iter_content(chunk_size=8192): | |
if chunk: | |
f.write(chunk) | |
pbar.update(len(chunk)) | |
self.logger.info(f"Downloaded {lang}.traineddata successfully") | |
log_with_check(f"Downloaded {lang}.traineddata") | |
except Exception as e: | |
self.logger.error(f"Error downloading Tesseract data: {str(e)}") | |
log_with_check("Failed to download Tesseract data", False) | |
def process_pdf(self, pdf_path) -> Dict: | |
self.logger.info(f"Processing PDF: {pdf_path} - Start") | |
log_with_check(f"Starting PDF processing for {pdf_path}") | |
if self.resource_manager.check_resources(): | |
self.logger.warning("Resources checked and cleaned during PDF processing") | |
log_with_check("Resources checked and cleaned") | |
try: | |
doc = fitz.open(pdf_path) | |
result = { | |
"title": os.path.basename(pdf_path), | |
"page_count": len(doc), | |
"pages": [], | |
"metadata": self._extract_pdf_metadata(doc), | |
"toc": self._extract_toc(doc) | |
} | |
futures = [] | |
for page_num in range(len(doc)): | |
future = self.executor.submit(self._process_page, doc, page_num) | |
futures.append(future) | |
for idx, future in enumerate(as_completed(futures)): | |
page_result = future.result() | |
result["pages"].append(page_result) | |
self.logger.debug(f"Completed page {idx+1}/{len(doc)}") | |
log_with_check(f"Completed page {idx+1}/{len(doc)}") | |
                gc.collect()  # free memory after each page
result["pages"].sort(key=lambda x: x["page_num"]) | |
result["structure"] = self._analyze_document_structure(result) | |
self.logger.info(f"PDF processing complete: {pdf_path} - End") | |
log_with_check(f"PDF processing completed for {pdf_path}") | |
self.resource_manager.stats['total_processed'] += 1 | |
self.resource_manager.stats['successful_extractions'] += 1 | |
            gc.collect()  # free memory after processing the whole document
return result | |
except Exception as e: | |
self.logger.error(f"Error processing PDF: {str(e)}") | |
log_with_check(f"Error processing PDF: {str(e)}", False) | |
self.resource_manager.stats['total_processed'] += 1 | |
self.resource_manager.stats['failed_extractions'] += 1 | |
return {"error": str(e), "traceback": traceback.format_exc()} | |
def _extract_pdf_metadata(self, doc) -> Dict: | |
metadata = {} | |
for key, value in doc.metadata.items(): | |
if value: | |
metadata[key] = value | |
return metadata | |
def _extract_toc(self, doc) -> List[Dict]: | |
toc = [] | |
try: | |
raw_toc = doc.get_toc() | |
for level, title, page in raw_toc: | |
toc.append({"level": level, "title": title, "page": page}) | |
        except Exception:
pass | |
return toc | |
def _process_page(self, doc, page_num: int) -> Dict: | |
self.logger.debug(f"Processing page {page_num+1} - Start") | |
log_with_check(f"Starting page {page_num+1} processing") | |
page = doc[page_num] | |
cache_key = f"page:{doc.name}:{page_num}:{hash(page.get_text())}" | |
cached_result = self.cache.get(cache_key) | |
if cached_result and self.config.use_caching: | |
self.logger.debug(f"Using cached result for page {page_num+1}") | |
log_with_check(f"Using cached result for page {page_num+1}") | |
return cached_result | |
result = {"page_num": page_num + 1, "width": page.rect.width, "height": page.rect.height, "text_regions": []} | |
image_list = [] | |
try: | |
pix = page.get_pixmap(matrix=fitz.Matrix(2, 2)) | |
img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, pix.n) | |
img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) | |
image_list.append(img) | |
if not image_list or img.size == 0: | |
with temp_file(".pdf") as temp_pdf: | |
with open(temp_pdf, "wb") as f: | |
doc.save(f) | |
page_images = convert_from_path(temp_pdf, first_page=page_num+1, last_page=page_num+1, dpi=75) | |
if page_images: | |
np_image = np.array(page_images[0]) | |
cv_image = cv2.cvtColor(np_image, cv2.COLOR_RGB2BGR) | |
image_list.append(cv_image) | |
except Exception as e: | |
self.logger.error(f"Error extracting page image: {str(e)}") | |
log_with_check(f"Error extracting page {page_num+1} image: {str(e)}", False) | |
for img in image_list: | |
regions = self._perform_ocr(img) | |
result["text_regions"].extend(regions) | |
raw_text = page.get_text() | |
if raw_text and raw_text.strip(): | |
result["text_direct"] = raw_text | |
if self.config.use_caching: | |
self.cache.set(cache_key, result) | |
self.logger.debug(f"Processing page {page_num+1} - End") | |
log_with_check(f"Completed page {page_num+1} processing") | |
        gc.collect()  # free memory after processing each page
return result | |
def _perform_ocr(self, image: np.ndarray) -> List[Dict]: | |
cache_key = self.cache.generate_key(image) | |
cached_result = self.cache.get(cache_key) | |
if cached_result and self.config.use_caching: | |
return cached_result | |
regions = [] | |
processed_img = self.img_processor.enhance_for_ocr(image) if self.config.use_preprocessing else image.copy() | |
try: | |
cloud_result = None | |
if self.cloud_ocr.is_available() and self.config.use_distributed: | |
with temp_file(".png") as temp_img: | |
cv2.imwrite(temp_img, processed_img) | |
cloud_result = self.cloud_ocr.process_image(temp_img) | |
if cloud_result: | |
for region in cloud_result.get("regions", []): | |
region_text = region.get("text", "").strip() | |
if region_text: | |
normalized_text = self.normalizer.normalize(region_text) | |
corrected_text, corr_conf = self.learner.apply_corrections( | |
normalized_text, {"language": self.config.language} | |
) | |
regions.append({ | |
"content": corrected_text, | |
"confidence": cloud_result.get("confidence", 0.8) * corr_conf, | |
"language": cloud_result.get("language", self.config.language), | |
"direction": "rtl" if Utils.is_rtl_language(self.config.language) else "ltr", | |
"provider": cloud_result.get("provider", "cloud"), | |
"bounding_box": region.get("bounding_box") | |
}) | |
else: | |
ocr_result = pytesseract.image_to_data( | |
processed_img, | |
lang=self.config.language, | |
config=f'--oem {CONFIG["OCR_ENGINE_MODE"]} --psm {CONFIG["PAGE_SEGMENTATION_MODE"]}', | |
output_type=pytesseract.Output.DICT | |
) | |
current_block = -1 | |
current_text = [] | |
current_conf = [] | |
for i in range(len(ocr_result['text'])): | |
text = ocr_result['text'][i].strip() | |
conf = int(ocr_result['conf'][i]) | |
block_num = ocr_result['block_num'][i] | |
if not text: | |
continue | |
if block_num != current_block: | |
if current_block != -1 and current_text: | |
full_text = ' '.join(current_text) | |
avg_conf = sum(current_conf) / len(current_conf) if current_conf else 0 | |
normalized_text = self.normalizer.normalize(full_text) | |
corrected_text, corr_conf = self.learner.apply_corrections( | |
normalized_text, {"language": self.config.language} | |
) | |
regions.append({ | |
"content": corrected_text, | |
"confidence": min(avg_conf * corr_conf / 100, 100), | |
"language": self.config.language, | |
"direction": "rtl" if Utils.is_rtl_language(self.config.language) else "ltr", | |
"provider": "tesseract" | |
}) | |
current_block = block_num | |
current_text = [] | |
current_conf = [] | |
current_text.append(text) | |
current_conf.append(conf) | |
if current_text: | |
full_text = ' '.join(current_text) | |
avg_conf = sum(current_conf) / len(current_conf) if current_conf else 0 | |
normalized_text = self.normalizer.normalize(full_text) | |
corrected_text, corr_conf = self.learner.apply_corrections( | |
normalized_text, {"language": self.config.language} | |
) | |
regions.append({ | |
"content": corrected_text, | |
"confidence": min(avg_conf * corr_conf / 100, 100), | |
"language": self.config.language, | |
"direction": "rtl" if Utils.is_rtl_language(self.config.language) else "ltr", | |
"provider": "tesseract" | |
}) | |
except Exception as e: | |
self.logger.error(f"OCR error: {str(e)}") | |
log_with_check("OCR failed", False) | |
if self.config.use_caching: | |
self.cache.set(cache_key, regions) | |
return regions | |
def _analyze_document_structure(self, doc_data: Dict) -> Dict: | |
structure = { | |
"title": doc_data.get("title", ""), | |
"language": self.config.language, | |
"page_count": doc_data.get("page_count", 0), | |
"sections": [], | |
"summary": "" | |
} | |
all_text = [] | |
for page in doc_data.get("pages", []): | |
page_text = [] | |
for region in page.get("text_regions", []): | |
if region.get("confidence", 0) >= self.config.confidence_threshold: | |
page_text.append(region.get("content", "")) | |
if page_text: | |
all_text.append(" ".join(page_text)) | |
toc = doc_data.get("toc", []) | |
if toc: | |
for item in toc: | |
section = {"title": item.get("title", ""), "level": item.get("level", 1), "page": item.get("page", 1)} | |
structure["sections"].append(section) | |
if all_text: | |
combined_text = "\n".join(all_text) | |
structure["summary"] = combined_text[:500] + "..." if len(combined_text) > 500 else combined_text | |
return structure | |
def _process_image_task(self, data: Dict) -> Dict: | |
try: | |
image_path = data['image_path'] | |
settings = data.get('settings', {}) | |
result = self.extract_text_from_image(image_path) | |
self.resource_manager.stats['total_processed'] += 1 | |
self.resource_manager.stats['successful_extractions'] += 1 | |
log_with_check(f"Image task completed for {image_path}") | |
return {"status": "success", "result": result} | |
except Exception as e: | |
self.logger.error(f"Error processing image task: {str(e)}") | |
self.resource_manager.stats['total_processed'] += 1 | |
self.resource_manager.stats['failed_extractions'] += 1 | |
log_with_check(f"Error processing image task: {str(e)}", False) | |
return {"status": "error", "error": str(e)} | |
def _process_document_task(self, data: Dict) -> Dict: | |
try: | |
pdf_path = data['pdf_path'] | |
settings = data.get('settings', {}) | |
result = self.process_pdf(pdf_path) | |
self.resource_manager.stats['total_processed'] += 1 | |
self.resource_manager.stats['successful_extractions'] += 1 | |
log_with_check(f"Document task completed for {pdf_path}") | |
return {"status": "success", "result": result} | |
except Exception as e: | |
self.logger.error(f"Error processing document task: {str(e)}") | |
self.resource_manager.stats['total_processed'] += 1 | |
self.resource_manager.stats['failed_extractions'] += 1 | |
log_with_check(f"Error processing document task: {str(e)}", False) | |
return {"status": "error", "error": str(e)} | |
def extract_text_from_image(self, image_path: str) -> Dict: | |
try: | |
img = cv2.imread(image_path) | |
if img is None: | |
log_with_check(f"Failed to load image: {image_path}", False) | |
return {"error": "Could not load image"} | |
regions = self._perform_ocr(img) | |
log_with_check(f"Extracted text from image: {image_path}") | |
return {"regions": regions, "text": "\n".join([r.get("content", "") for r in regions])} | |
except Exception as e: | |
self.logger.error(f"Error extracting text from image: {str(e)}") | |
log_with_check(f"Error extracting text from image: {str(e)}", False) | |
return {"error": str(e)} | |
def batch_process(self, file_paths: List[str], output_dir: str = None) -> Dict: | |
output_dir = output_dir or CONFIG["OUTPUT_DIR"] | |
os.makedirs(output_dir, exist_ok=True) | |
results = {} | |
for file_path in file_paths: | |
file_ext = Utils.get_file_extension(file_path) | |
file_name = os.path.basename(file_path) | |
output_file = os.path.join(output_dir, file_name.replace(file_ext, ".json")) | |
if file_ext.lower() in [".pdf"]: | |
result = self.process_pdf(file_path) | |
elif file_ext.lower() in [".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".webp"]: | |
result = self.extract_text_from_image(file_path) | |
else: | |
result = {"error": f"Unsupported file type: {file_ext}"} | |
with open(output_file, "w", encoding="utf-8") as f: | |
json.dump(result, f, ensure_ascii=False, indent=2) | |
results[file_name] = {"status": "success" if "error" not in result else "error"} | |
log_with_check(f"Processed {file_name} to {output_file}") | |
return results | |
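# Minimal usage sketch for batch processing (illustrative; the input paths are placeholders):
#   processor = DocumentProcessor()
#   summary = processor.batch_process(["docs/report.pdf", "scans/page1.png"], output_dir="output")
#   # each input produces a JSON file in output/, e.g. output/report.json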
def extract_text_from_image(image_path: str, output_format: str = "text") -> Dict: | |
processor = DocumentProcessor() | |
regions = processor.extract_text_from_image(image_path) | |
if "error" in regions: | |
return regions | |
if output_format == "text": | |
return {"text": regions["text"]} | |
return regions | |
def extract_text_from_pdf(pdf_path: str, output_format: str = "text") -> Dict: | |
processor = DocumentProcessor() | |
result = processor.process_pdf(pdf_path) | |
if "error" in result: | |
return result | |
if output_format == "text": | |
text = "\n".join([region["content"] for page in result["pages"] for region in page["text_regions"] if "content" in region]) | |
return {"text": text} | |
return result | |
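# Minimal usage sketch for the module-level helpers (illustrative paths):
#   print(extract_text_from_image("page.png")["text"])
#   print(extract_text_from_pdf("book.pdf", output_format="text")["text"])
# Each call constructs a fresh DocumentProcessor, so repeated work is cheaper through
# DocumentProcessor.batch_process above.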
def launch_ui(): | |
processor = DocumentProcessor() | |
    def process_file(file, output_format):
        if file is None:
            logging.info("No file uploaded")
            return None, "لطفاً فایلی آپلود کنید"
        if not hasattr(file, 'name') or file.name is None:
            logging.warning("Invalid file object")
            return None, "فایل نامعتبر است"
        try:
            start_time = time.time()
            logging.info(f"Starting process for file: {file.name}")
            if file.name.lower().endswith(('.jpg', '.jpeg', '.png')):
                result = processor.extract_text_from_image(file.name)
            elif file.name.lower().endswith('.pdf'):
                result = processor.process_pdf(file.name)
            else:
                logging.error(f"Unsupported file type: {file.name}")
                return None, "نوع فایل پشتیبانی نمیشود"
            elapsed_time = time.time() - start_time
            if "error" in result:
                logging.error(f"Error processing file {file.name}: {result['error']}")
                return None, f"خطا: {result['error']}"
            logging.info(f"Processing completed for {file.name} in {elapsed_time:.2f} seconds")
            if output_format == "text":
                if "text" in result:
                    output = result["text"]
                elif "pages" in result:
                    output = "\n".join(r.get("content", "") for p in result["pages"] for r in p.get("text_regions", []))
                else:
                    output = "متن یافت نشد"
            else:
                output = json.dumps(result, ensure_ascii=False, indent=2)
            base_name = os.path.splitext(os.path.basename(file.name))[0]
            output_file = os.path.join(CONFIG["TEMP_DIR"], f"{base_name}_output.txt" if output_format == "text" else f"{base_name}_output.json")
            with open(output_file, "w", encoding="utf-8") as f:
                f.write(output)
            log_with_check(f"Output file created: {output_file}")
            gc.collect()  # free memory after writing the output file
            return gr.update(value=output_file, label="دانلود خروجی", visible=True), output
        except Exception as e:
            logging.error(f"Unexpected error processing file {file.name}: {str(e)}")
            return None, f"خطای غیرمنتظره: {str(e)}"
    # UI with an improved theme and layout
with gr.Blocks(title=CONFIG["UI_TITLE"], theme=gr.themes.Soft()) as demo: | |
gr.Markdown( | |
f""" | |
# {CONFIG['UI_TITLE']} | |
<p style="text-align: center; color: #4CAF50;">سیستمی برای استخراج متن از تصاویر و PDFها با دقت بالا</p> | |
""", | |
elem_id="title" | |
) | |
with gr.Row(elem_classes="main-row"): | |
with gr.Column(scale=1, elem_classes="input-column"): | |
file_input = gr.File(label="فایل را آپلود کنید", file_types=[".pdf", ".jpg", ".png", ".jpeg"]) | |
output_format = gr.Dropdown(choices=["text", "json"], label="فرمت خروجی", value="text") | |
submit_btn = gr.Button("پردازش", variant="primary") | |
download_output = gr.File(label="دانلود خروجی", visible=False) | |
with gr.Column(scale=2, elem_classes="output-column"): | |
output_text = gr.Textbox(label="متن استخراجشده", lines=10, interactive=False) | |
        # JavaScript snippet for downloading the output file
demo.js = """ | |
function (file, output) { | |
if (file) { | |
const link = document.createElement('a'); | |
link.href = URL.createObjectURL(file); | |
link.download = file.name; | |
document.body.appendChild(link); | |
link.click(); | |
document.body.removeChild(link); | |
URL.revokeObjectURL(link.href); | |
} | |
return [file, output]; | |
} | |
""" | |
        # Custom styles
demo.css = """ | |
.main-row { | |
padding: 20px; | |
background-color: #f5f5f5; | |
border-radius: 10px; | |
box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); | |
} | |
.input-column, .output-column { | |
padding: 10px; | |
animation: fadeIn 0.5s ease-in; | |
} | |
@keyframes fadeIn { | |
from { opacity: 0; } | |
to { opacity: 1; } | |
} | |
#title { | |
font-family: 'Arial', sans-serif; | |
font-size: 24px; | |
margin-bottom: 20px; | |
} | |
""" | |
submit_btn.click( | |
fn=process_file, | |
inputs=[file_input, output_format], | |
outputs=[download_output, output_text] | |
).then( | |
fn=lambda x: gr.update(visible=True) if x else gr.update(visible=False), | |
inputs=[download_output], | |
outputs=[download_output] | |
) | |
try: | |
demo.launch(server_name="0.0.0.0", server_port=7860) | |
log_with_check("Gradio interface launched") | |
except Exception as e: | |
logging.error(f"Failed to launch Gradio interface: {str(e)}") | |
log_with_check("Gradio interface launch failed", False) | |
def setup_signal_handlers(): | |
def signal_handler(sig, frame): | |
logging.info("Shutdown signal received, cleaning up...") | |
log_with_check("Starting shutdown cleanup") | |
try: | |
for filename in os.listdir(CONFIG["TEMP_DIR"]): | |
file_path = os.path.join(CONFIG["TEMP_DIR"], filename) | |
if os.path.isfile(file_path): | |
file_age = time.time() - os.path.getmtime(file_path) | |
if file_age > 3600: | |
os.remove(file_path) | |
            gc.collect()  # free memory during shutdown
        except Exception:
pass | |
log_with_check("Cleanup completed") | |
sys.exit(0) | |
signal.signal(signal.SIGINT, signal_handler) | |
signal.signal(signal.SIGTERM, signal_handler) | |
def check_dependencies(): | |
missing = [] | |
try: | |
pytesseract.get_tesseract_version() | |
logging.info("Tesseract installed") | |
log_with_check("Tesseract dependency checked") | |
    except Exception:
missing.append("Tesseract OCR") | |
log_with_check("Tesseract dependency check failed", False) | |
logging.info(f"Missing dependencies: {', '.join(missing)}" if missing else "All dependencies are present") | |
log_with_check("Dependencies check completed") | |
def install_tesseract_method1(): | |
"""روش اول: نصب Tesseract با استفاده از subprocess.check_call""" | |
try: | |
subprocess.check_call(['apt-get', 'update', '-y', '--fix-missing', '--allow-downgrades']) | |
subprocess.check_call(['apt-get', 'install', 'tesseract-ocr', '-y', '--no-install-recommends', '--fix-missing', '--allow-downgrades', '--fix-broken']) | |
os.environ["TESSDATA_PREFIX"] = os.path.abspath(CONFIG["TESSDATA_LOCAL"]) | |
pytesseract.pytesseract.tesseract_cmd = CONFIG["TESSERACT_CMD"] | |
version = pytesseract.get_tesseract_version() | |
logging.info(f"Tesseract installed via Method 1, version: {version}") | |
log_with_check("Tesseract installed via Method 1") | |
return True | |
except subprocess.CalledProcessError as e: | |
logging.warning(f"Method 1 failed: {str(e)}") | |
log_with_check("Tesseract installation Method 1 failed", False) | |
return False | |
def install_tesseract_method2(): | |
"""روش دوم: نصب Tesseract با استفاده از os.system""" | |
logging.warning("Attempting Method 2: manual installation") | |
log_with_check("Attempting Method 2 installation", False) | |
os.system('chmod 777 /tmp') | |
os.system('apt-get update -y --fix-missing --allow-downgrades') | |
os.system('apt-get install tesseract-ocr -y --no-install-recommends --fix-missing --allow-downgrades --fix-broken') | |
os.system('pip install -q pytesseract') | |
if __name__ == "__main__": | |
print("YES") | |
os.environ["TESSDATA_PREFIX"] = os.path.abspath(CONFIG["TESSDATA_LOCAL"]) | |
pytesseract.pytesseract.tesseract_cmd = CONFIG["TESSERACT_CMD"] | |
try: | |
version = pytesseract.get_tesseract_version() | |
logging.info(f"Tesseract installed via Method 2, version: {version}") | |
log_with_check("Tesseract installed via Method 2") | |
return True | |
except Exception as e: | |
logging.error(f"Method 2 failed: {str(e)}") | |
log_with_check("Tesseract installation Method 2 failed", False) | |
return False | |
def setup_tesseract_alternative(): | |
"""نصب Tesseract با دو روش به صورت متوالی""" | |
log_with_check("Starting Tesseract setup") | |
if not install_tesseract_method1(): | |
install_tesseract_method2() | |
if __name__ == "__main__": | |
setup_signal_handlers() | |
    setup_tesseract_alternative()  # install Tesseract first
    check_dependencies()  # then verify the dependencies
try: | |
launch_ui() | |
except Exception as e: | |
logging.error(f"Application failed to start: {str(e)}") | |
log_with_check("Application startup failed", False) |