Spaces:
Running
Running
Delete app/persian_ocr/app.py
Browse files- app/persian_ocr/app.py +0 -998
app/persian_ocr/app.py
DELETED
@@ -1,998 +0,0 @@
|
|
1 |
-
#!/usr/bin/env python3
|
2 |
-
# -*- coding: utf-8 -*-
|
3 |
-
|
4 |
-
import os
|
5 |
-
import subprocess
|
6 |
-
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
|
7 |
-
import tensorflow as tf
|
8 |
-
tf.get_logger().setLevel('ERROR')
|
9 |
-
|
10 |
-
import logging
|
11 |
-
from logging.handlers import RotatingFileHandler
|
12 |
-
import pytesseract
|
13 |
-
import easyocr
|
14 |
-
import cv2
|
15 |
-
import numpy as np
|
16 |
-
from PIL import Image
|
17 |
-
import re
|
18 |
-
from typing import Dict, List, Optional, Any, Union, Tuple
|
19 |
-
from hazm import Normalizer, word_tokenize
|
20 |
-
import json
|
21 |
-
from datetime import datetime
|
22 |
-
from transformers import pipeline, TrOCRProcessor, VisionEncoderDecoderModel
|
23 |
-
import gradio as gr
|
24 |
-
import fitz
|
25 |
-
from tqdm import tqdm
|
26 |
-
from dataclasses import dataclass
|
27 |
-
from functools import lru_cache
|
28 |
-
import threading
|
29 |
-
from concurrent.futures import ThreadPoolExecutor, as_completed
|
30 |
-
import tempfile
|
31 |
-
import shutil
|
32 |
-
from pathlib import Path
|
33 |
-
import hashlib
|
34 |
-
import pickle
|
35 |
-
from collections import OrderedDict
|
36 |
-
import time
|
37 |
-
import torch
|
38 |
-
import psutil
|
39 |
-
import warnings
|
40 |
-
import io
|
41 |
-
|
42 |
-
warnings.filterwarnings('ignore')
|
43 |
-
from collections import namedtuple
|
44 |
-
|
45 |
-
# --- System constants ---
|
46 |
-
# Paths handed to pytesseract / TESSDATA_PREFIX elsewhere in this module.
TESSERACT_CMD = '/usr/bin/tesseract'
TESSDATA_PREFIX = '/usr/share/tesseract-ocr/4.00/tessdata'

# Emoji status markers.
SUCCESS, FAILURE, PROCESSING = "✅", "❌", "🔄"
|
51 |
-
|
52 |
-
# --- Install system dependencies for Hugging Face Spaces ---
|
53 |
-
def setup_system_dependencies():
    """Install required system packages via apt-get and configure pytesseract.

    Every ``apt-get`` call runs with ``check=True``; any failure is logged and
    re-raised to the caller.
    """
    logger.info("START: نصب وابستگیهای سیستمی برای Hugging Face Spaces")

    def apt_get(*arguments):
        subprocess.run(['apt-get', *arguments], check=True)

    try:
        # Tesseract and its Persian language pack are installed only when the
        # Tesseract binary is not already present.
        tesseract_present = os.path.isfile(TESSERACT_CMD)
        if not tesseract_present:
            logger.info("PROCESSING: نصب Tesseract OCR")
            apt_get('update')
            for package in ('tesseract-ocr', 'tesseract-ocr-fas'):
                apt_get('install', '-y', package)

        # OpenCV development files and Fontconfig.
        for package in ('libopencv-dev', 'fontconfig'):
            apt_get('install', '-y', package)

        # Point pytesseract at the Tesseract binary and its language data.
        pytesseract.pytesseract.tesseract_cmd = TESSERACT_CMD
        os.environ['TESSDATA_PREFIX'] = TESSDATA_PREFIX
        logger.info("SUCCESS: وابستگیهای سیستمی نصب شدند")
    except Exception as error:
        logger.error(f"FAILURE: خطا در نصب وابستگیهای سیستمی: {str(error)}")
        raise
|
74 |
-
|
75 |
-
# --- Persian-OCR functions (from detect.py) ---
|
76 |
-
def get_grayscale(image):
    """Return a single-channel version of ``image``.

    Returns ``None`` when ``image`` is not a non-empty ndarray with at least
    two dimensions. Images that are already single-channel (2-D, or 3-D with
    one channel) are returned without colour conversion, because
    ``cv2.cvtColor(..., COLOR_BGR2GRAY)`` rejects them.
    """
    if image is None or not isinstance(image, np.ndarray) or image.size == 0 or len(image.shape) < 2:
        return None
    if image.ndim == 2:
        # Already grayscale: nothing to convert.
        return image
    if image.ndim == 3 and image.shape[2] == 1:
        # Single channel stored with an explicit channel axis.
        return image[:, :, 0]
    return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
|
80 |
-
|
81 |
-
def remove_noise(image):
    """Apply a 3x3 median blur; return ``None`` for unusable input."""
    usable = isinstance(image, np.ndarray) and image.size != 0 and len(image.shape) >= 2
    if not usable:
        return None
    return cv2.medianBlur(image, 3)
|
85 |
-
|
86 |
-
def thresholding(image):
    """Binarise ``image`` with a fixed threshold of 160 (values above become 255).

    Returns the thresholded array, or ``None`` when ``image`` is not a
    non-empty ndarray with at least two dimensions. The invalid-input result
    is a single ``None`` so it has the same shape as the valid result and
    matches the sibling helpers.
    """
    if image is None or not isinstance(image, np.ndarray) or image.size == 0 or len(image.shape) < 2:
        return None
    # cv2.threshold returns (retval, dst); only the image is handed back.
    return cv2.threshold(image, 160, 255, cv2.THRESH_BINARY)[1]
|
90 |
-
|
91 |
-
def dilate(image):
    """Dilate ``image`` once with a 5x5 all-ones kernel; ``None`` for unusable input."""
    if not isinstance(image, np.ndarray):
        return None
    if image.size == 0 or len(image.shape) < 2:
        return None
    structuring = np.ones((5, 5), np.uint8)
    return cv2.dilate(image, structuring, iterations=1)
|
96 |
-
|
97 |
-
def erode(image):
    """Erode ``image`` once with a 5x5 all-ones kernel; ``None`` for unusable input."""
    is_array = isinstance(image, np.ndarray)
    if not is_array or image.size == 0 or len(image.shape) < 2:
        return None
    return cv2.erode(image, np.ones((5, 5), np.uint8), iterations=1)
|
102 |
-
|
103 |
-
def opening(image):
    """Morphological opening with a 5x5 all-ones kernel; ``None`` for unusable input."""
    if not (isinstance(image, np.ndarray) and image.size and len(image.shape) >= 2):
        return None
    square = np.ones((5, 5), np.uint8)
    return cv2.morphologyEx(image, cv2.MORPH_OPEN, square)
|
108 |
-
|
109 |
-
def canny(image):
    """Run Canny edge detection (thresholds 100/200); ``None`` for unusable input."""
    if not isinstance(image, np.ndarray):
        return None
    too_small = image.size == 0 or len(image.shape) < 2
    return None if too_small else cv2.Canny(image, 100, 200)
|
113 |
-
|
114 |
-
def deskew(image):
    """Rotate ``image`` by the angle of the minimum-area rectangle around its non-zero pixels.

    Returns ``None`` for unusable input and the image itself when it has no
    non-zero pixels.
    """
    if not isinstance(image, np.ndarray) or image.size == 0 or len(image.shape) < 2:
        return None

    points = np.column_stack(np.where(image > 0))
    if points.size == 0:
        return image

    rect_angle = cv2.minAreaRect(points)[-1]
    rotation = -(90 + rect_angle) if rect_angle < -45 else -rect_angle

    height, width = image.shape[:2]
    matrix = cv2.getRotationMatrix2D((width // 2, height // 2), rotation, 1.0)
    return cv2.warpAffine(
        image,
        matrix,
        (width, height),
        flags=cv2.INTER_CUBIC,
        borderMode=cv2.BORDER_REPLICATE,
    )
|
130 |
-
|
131 |
-
def match_template(image, template):
    """Return the normalised cross-correlation map of ``template`` over ``image``.

    Returns ``None`` unless both arguments are ndarrays.
    """
    both_arrays = isinstance(image, np.ndarray) and isinstance(template, np.ndarray)
    if not both_arrays:
        return None
    return cv2.matchTemplate(image, template, cv2.TM_CCOEFF_NORMED)
|
135 |
-
|
136 |
-
def persian_ocr_main(image: np.ndarray, langs="fa", mode="tn"):
    """Run Tesseract on ``image`` after rescaling it to a width of 1024 pixels.

    Args:
        image: Image array; anything that is not a non-empty ndarray with at
            least two dimensions yields ``""``.
        langs: ``"fa"``, ``"en"`` or ``"faen"``.
        mode: Only used when ``langs == "fa"``; one of ``"t"``, ``"tn"`` or
            ``"table"`` (selects the character black/white list).

    Returns:
        The recognised text, or ``""`` when the image is unusable.

    Raises:
        ValueError: for an unknown ``langs`` or (with ``langs == "fa"``) an
            unknown ``mode``.
    """
    if image is None or not isinstance(image, np.ndarray) or image.size == 0 or len(image.shape) < 2:
        return ""

    # Resolve the Tesseract configuration before touching the file system so
    # bad arguments fail fast and leave no temporary files behind.
    persian_configs = {
        "t": r'-l fas --psm 6 -c tessedit_char_blacklist="۰١۲۳۴۵۶۷۸۹«»1234567890#"',
        "tn": r'-l fas --psm 6 -c tessedit_char_whitelist="آابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی ۰١۲۳۴۵۶۷۸۹.?!,،:;/"',
        "table": r'-l fas --psm 6 -c tessedit_char_whitelist="آابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی۰١۲۳۴۵۶۷۸۹"',
    }
    if langs == "fa":
        if mode not in persian_configs:
            # Previously this fell through and failed later on an unbound local.
            raise ValueError("Choose valid mode options.")
        custom_config = persian_configs[mode]
    elif langs == "en":
        custom_config = r'-l eng --psm 6'
    elif langs == "faen":
        custom_config = r'-l fas+eng --psm 6'
    else:
        raise ValueError("Choose valid language options.")

    # The directory and everything in it is removed on exit, including when
    # an exception is raised part-way through.
    with tempfile.TemporaryDirectory() as work_dir:
        input_path = os.path.join(work_dir, "input.png")
        upscaled_path = os.path.join(work_dir, "input_Upscaled.png")
        cv2.imwrite(input_path, image)

        with Image.open(input_path) as im:
            length_x, width_y = im.size
            factor = float(1024.0 / length_x)
            size = int(factor * length_x), int(factor * width_y)
            im.resize(size, Image.Resampling.LANCZOS).save(upscaled_path, dpi=(300, 300))

        # Only the grayscale step of the preprocessing helpers is applied.
        gray = get_grayscale(cv2.imread(upscaled_path))
        if gray is None:
            return ""
        return pytesseract.image_to_string(gray, config=custom_config)
|
181 |
-
|
182 |
-
# --- Logging configuration ---
|
183 |
-
class CustomFormatter(logging.Formatter):
    """Formatter that colours output by level and decorates the message.

    Each formatted message gets a ``[Mem: x%]`` suffix taken from
    ``psutil.virtual_memory().percent`` and is prefixed with the emoji of
    every ``STATUS_EMOJI`` key that occurs in it.
    """

    # ANSI escape sequences used to colour each level.
    grey = "\x1b[38;21m"
    blue = "\x1b[38;5;39m"
    yellow = "\x1b[38;5;226m"
    red = "\x1b[38;5;196m"
    bold_red = "\x1b[31;1m"
    reset = "\x1b[0m"
    STATUS_EMOJI = {
        'START': '🟦', 'SUCCESS': '✅', 'FAILURE': '❌', 'LOADING': '⏳',
        'PROCESSING': '🔄', 'WARNING': '⚠️', 'MEMORY': '💾'
    }

    def __init__(self, fmt):
        """Store ``fmt`` and build the colour-wrapped format for each level."""
        super().__init__()
        self.fmt = fmt
        self.FORMATS = {
            logging.DEBUG: self.grey + self.fmt + self.reset,
            logging.INFO: self.blue + self.fmt + self.reset,
            logging.WARNING: self.yellow + self.fmt + self.reset,
            logging.ERROR: self.red + self.fmt + self.reset,
            logging.CRITICAL: self.bold_red + self.fmt + self.reset
        }
        # Build the per-level formatters once instead of on every record.
        self._formatters = {
            level: logging.Formatter(level_fmt) for level, level_fmt in self.FORMATS.items()
        }
        # Levels without an entry fall back to the default Formatter, exactly
        # as ``logging.Formatter(None)`` did.
        self._fallback = logging.Formatter()

    def format(self, record):
        """Return ``record`` formatted with colour, memory suffix and emoji.

        ``record.msg`` is restored afterwards: the same record object is
        passed to every handler, so a permanent change would leak the
        decoration into other handlers' output.
        """
        formatter = self._formatters.get(record.levelno, self._fallback)
        original_msg = record.msg
        try:
            memory_usage = psutil.virtual_memory().percent
            suffix = f" [Mem: {memory_usage:.1f}%]"
            if record.args:
                # The message will be %-formatted with args, so the literal
                # percent sign must be escaped.
                suffix = suffix.replace('%', '%%')
            decorated = f"{original_msg}{suffix}"
            for status, emoji in self.STATUS_EMOJI.items():
                if status in decorated:
                    decorated = f"{emoji} {decorated}"
            record.msg = decorated
            return formatter.format(record)
        finally:
            record.msg = original_msg
|
213 |
-
|
214 |
-
# Module logger: DEBUG and above goes to a rotating log file and to the console.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

logs_dir = "/app/logs"
os.makedirs(logs_dir, exist_ok=True)
log_file = os.path.join(logs_dir, "ocr.log")

# Plain-text file output, rotated at 10 MiB with five backups kept.
file_handler = RotatingFileHandler(
    log_file,
    maxBytes=10 * 1024 * 1024,
    backupCount=5,
    encoding='utf-8',
)
file_handler.setFormatter(
    logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
)

# Console output goes through CustomFormatter.
console_handler = logging.StreamHandler()
console_handler.setFormatter(
    CustomFormatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
)

logger.addHandler(file_handler)
logger.addHandler(console_handler)
|
225 |
-
|
226 |
-
# --- OCRResult record (namedtuple) ---
|
227 |
-
# Immutable record describing the outcome of one OCR run.
OCRResult = namedtuple(
    'OCRResult',
    'text numbers confidence model_name processing_time '
    'image_quality detected_language word_count char_count '
    'preprocessing_info error_rate',
)
|
230 |
-
|
231 |
-
# --- Model management ---
|
232 |
-
# Loaded model objects, keyed by model name (filled by load_model).
models = {}

# Model names in their default order of preference.
model_priority = ['pretrained_model', 'mT5_OCR_fa', 'LayoutLMv3_fa', 'easyocr', 'tesseract', 'persian_ocr']

# Per-model success/failure counters, all starting at zero.
model_performance = {
    model_key: {'success': 0, 'fail': 0} for model_key in model_priority
}

# Static configuration for the models that have one.
model_configs = {
    'pretrained_model': {
        'name': "beheshti-ai/TrOCR-fa",
        'type': "transformer",
        'threshold': 0.8,
        'device': "cpu",
    },
    'mT5_OCR_fa': {
        'name': "aleemeconomist/mT5-OCR-fa",
        'type': "image-to-text",
        'threshold': 0.7,
        'device': "cpu",
    },
    'LayoutLMv3_fa': {
        'name': "SoheilStar/LayoutLMv3-fa",
        'type': "document-question-answering",
        'threshold': 0.7,
        'device': "cpu",
    },
    'persian_ocr': {
        'name': "Persian-OCR",
        'type': "custom",
        'threshold': 0.75,
        'device': "cpu",
    },
}

# Lock acquired by load_model.
model_lock = threading.Lock()
|
249 |
-
normalizer = Normalizer()
|
250 |
-
|
251 |
-
def load_model(model_name: str, progress=None):
    """Load ``model_name`` into the module-level ``models`` registry.

    Args:
        model_name: Key from ``model_configs``, or ``"easyocr"`` / ``"tesseract"``.
        progress: Optional callable receiving a float in ``[0, 1]``.

    Returns:
        ``True`` when the model is registered (already, or after loading),
        ``False`` when loading failed or no loader exists for the name. The
        matching ``model_performance`` counter is incremented either way.
    """
    global models, model_performance, model_configs, model_lock
    with model_lock:
        if model_name in models:
            return True

        logger.info(f"START Loading model: {model_name}")
        # Names without a counter entry get one, so neither the success path
        # nor the error handler can fail with a KeyError.
        stats = model_performance.setdefault(model_name, {'success': 0, 'fail': 0})
        try:
            config = model_configs.get(model_name)
            if config:
                model_type = config['type']
                if model_type in ("image-to-text", "document-question-answering"):
                    models[model_name] = pipeline(model_type, model=config['name'], device=config['device'])
                elif model_type == "transformer":
                    if progress:
                        progress(0.3)
                    processor = TrOCRProcessor.from_pretrained(config['name'])
                    if progress:
                        progress(0.6)
                    model_instance = VisionEncoderDecoderModel.from_pretrained(config['name'])
                    models[model_name] = {'processor': processor, 'model': model_instance, 'device': config['device']}
                elif model_type == "custom" and model_name == "persian_ocr":
                    # Nothing to load; the entry only marks the engine as available.
                    models[model_name] = True
            elif model_name == "easyocr":
                if progress:
                    progress(0.5)
                models[model_name] = easyocr.Reader(['fa', 'en'], gpu=torch.cuda.is_available())
            elif model_name == "tesseract":
                # Configure Tesseract for Hugging Face Spaces.
                pytesseract.pytesseract.tesseract_cmd = TESSERACT_CMD
                os.environ['TESSDATA_PREFIX'] = TESSDATA_PREFIX
                models[model_name] = True

            if model_name not in models:
                # No branch above handled this name/type: report a failure
                # instead of claiming a successful load.
                raise ValueError(f"No loader available for model: {model_name}")

            logger.info(f"SUCCESS Model {model_name} loaded")
            stats['success'] += 1
            if progress:
                progress(1.0)
            return True
        except Exception as e:
            logger.error(f"FAILURE Error loading {model_name}: {str(e)}")
            stats['fail'] += 1
            if progress:
                progress(1.0)
            return False
|
296 |
-
|
297 |
-
def process_image(image: np.ndarray, progress=None):
    """Try the OCR engines in order until one returns text with enough confidence.

    Engines are ordered by ``success / (fail + 1)`` from ``model_performance``,
    highest first. Returns the first ``OCRResult`` whose confidence reaches
    the engine's threshold (0.5 when the engine has no configured threshold),
    or ``None`` when no engine succeeds.

    Raises:
        ValueError: when ``image`` is not a non-empty ndarray with at least
            two dimensions.
    """
    global models, model_performance, model_priority, model_configs
    start_time = time.time()
    if progress:
        progress(0.0)

    if image is None or not isinstance(image, np.ndarray) or image.size == 0 or len(image.shape) < 2:
        logger.error("FAILURE Input image to process_image is invalid or empty")
        raise ValueError("Input image is invalid or empty")

    logger.debug(f"Processing image with shape: {image.shape}")

    def success_ratio(name):
        counters = model_performance[name]
        return counters['success'] / (counters['fail'] + 1)

    ranked_models = sorted(model_priority, key=success_ratio, reverse=True)
    model_count = len(ranked_models)

    for position, model_name in enumerate(ranked_models, start=1):
        try:
            if not load_model(model_name, progress):
                continue

            if progress:
                progress(position / model_count)

            # NOTE(review): the nesting of this dispatch is reconstructed from
            # a source that had lost its indentation - confirm against the
            # original file.
            result_dict = None
            config = model_configs.get(model_name)
            if model_name in model_configs:
                engine_type = config['type']
                if engine_type == "transformer":
                    result_dict = _process_transformer_model_full(image, model_name)
                elif engine_type in ("image-to-text", "document-question-answering"):
                    result_dict = _process_transformer_model(image, model_name)
                elif model_name == 'persian_ocr':
                    result_dict = _process_persian_ocr(image)
            elif model_name == 'easyocr':
                result_dict = _process_easyocr(image)
            elif model_name == 'tesseract':
                result_dict = _process_tesseract(image)

            has_text = result_dict and 'text' in result_dict and result_dict['text'].strip()
            if not has_text:
                continue

            elapsed = time.time() - start_time
            ocr_result = _format_result(
                result_dict['text'],
                result_dict.get('confidence', 0.5),
                model_name,
                elapsed,
            )

            threshold = model_configs.get(model_name, {}).get('threshold', 0.5)
            if ocr_result.confidence < threshold:
                continue

            logger.info(f"SUCCESS Model {model_name} succeeded")
            if progress:
                progress(1.0)
            return ocr_result
        except Exception as e:
            logger.warning(f"WARNING Model {model_name} failed: {str(e)}")
            continue

    logger.warning("WARNING No model succeeded")
    if progress:
        progress(1.0)
    return None
|
362 |
-
|
363 |
-
def _process_transformer_model(image: np.ndarray, model_name: str):
    """Run a loaded pipeline model on ``image`` and return its text.

    The reported confidence is the model's configured threshold, not a value
    produced by the model.
    """
    global models, model_configs
    rgb_pixels = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    pil_image = Image.fromarray(rgb_pixels)
    model = models[model_name]
    settings = model_configs[model_name]

    if settings['type'] == "image-to-text":
        text = model(pil_image)[0]['generated_text']
    else:
        # "document-question-answering"
        # NOTE(review): the pipeline is called without a question here -
        # confirm this matches the pipeline's calling convention.
        text = model(pil_image)['answer']
    return {'text': text, 'confidence': settings['threshold']}
|
374 |
-
|
375 |
-
def _process_transformer_model_full(image: np.ndarray, model_name: str):
    """Run a processor + encoder-decoder model on ``image`` and decode its text.

    The reported confidence is the model's configured threshold.
    """
    global models, model_configs
    pil_image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

    entry = models[model_name]
    processor, model, device = entry['processor'], entry['model'], entry['device']

    encoded = processor(images=pil_image, return_tensors="pt")
    pixel_values = encoded.pixel_values.to(device)
    generated_ids = model.generate(pixel_values)
    decoded = processor.batch_decode(generated_ids, skip_special_tokens=True)

    return {'text': decoded[0], 'confidence': model_configs[model_name]['threshold']}
|
387 |
-
|
388 |
-
def _process_easyocr(image: np.ndarray):
    """Read ``image`` with the loaded EasyOCR reader.

    Returns the detected texts joined by spaces and the mean of their
    confidences, or empty text with confidence 0 when nothing is detected.
    """
    global models
    detections = models['easyocr'].readtext(image)
    if not detections:
        return {'text': '', 'confidence': 0}

    fragments = []
    score_total = 0
    for _, fragment, score in detections:
        fragments.append(fragment)
        score_total += score
    return {'text': ' '.join(fragments), 'confidence': score_total / len(detections)}
|
397 |
-
|
398 |
-
def _process_tesseract(image: np.ndarray):
    """OCR ``image`` with Tesseract (Persian + English); confidence is a fixed 0.5."""
    recognised = pytesseract.image_to_string(image, config='--oem 3 --psm 6 -l fas+eng')
    result = {'text': recognised}
    result['confidence'] = 0.5
    return result
|
401 |
-
|
402 |
-
def _process_persian_ocr(image: np.ndarray):
|
403 |
-
try:
|
404 |
-
if image is None or not isinstance(image, np.ndarray) or image.size == 0 or len(image.shape) < 2:
|
405 |
-
return {'text': '', 'confidence': 0}
|
406 |
-
text = persian_ocr_main(image, langs="fa", mode="tn")
|
407 |
-
return {'text': text, 'confidence': 0.75}
|
408 |
-
except Exception as e:
|
409 |
-
logger.error(f"FAILURE Persian-OCR processing failed: {str(e)}")
|
410 |
-
return {'text': '', 'confidence': 0}
|
411 |
-
|
412 |
-
def _format_result(text: str, confidence: float, model_name: str, processing_time: float):
    """Normalise and tokenise ``text`` and wrap the outcome in an ``OCRResult``.

    Tokens made only of ASCII/Persian digits (optionally with one decimal
    separator) go to ``numbers``; every other token goes to ``text``. Note
    that the ``text`` field therefore holds a list of tokens, not a string.
    """
    global normalizer
    tokens = word_tokenize(normalizer.normalize(text))

    persian_nums = '۰۱۲۳۴۵۶۷۸۹'
    number_pattern = f'^[0-9{persian_nums}]+([\\.,،٫][0-9{persian_nums}]+)?$'
    is_number = re.compile(number_pattern).match

    numbers = []
    text_list = []
    for token in tokens:
        bucket = numbers if is_number(token) else text_list
        bucket.append(token)

    return OCRResult(
        text=text_list,
        numbers=numbers,
        confidence=confidence,
        model_name=model_name,
        processing_time=processing_time,
        image_quality=_assess_quality(text_list),
        detected_language=_detect_language(text_list),
        word_count=len(text_list),
        char_count=sum(map(len, text_list)),
        preprocessing_info={},
        error_rate=_estimate_error_rate(text_list, confidence),
    )
|
434 |
-
|
435 |
-
def _estimate_error_rate(text_list: List[str], confidence: float):
|
436 |
-
if not text_list:
|
437 |
-
return 1.0
|
438 |
-
avg_word_length = sum(len(w) for w in text_list) / len(text_list) if text_list else 0
|
439 |
-
return max(0.0, min(1.0, 1.0 - confidence + (3 - avg_word_length) / 10))
|
440 |
-
|
441 |
-
def _assess_quality(text_list: List[str]):
|
442 |
-
if not text_list:
|
443 |
-
return "Low"
|
444 |
-
avg_word_length = sum(len(w) for w in text_list) / len(text_list) if text_list else 0
|
445 |
-
word_count = len(text_list)
|
446 |
-
return "High" if word_count > 50 and avg_word_length > 3 else "Medium" if word_count > 20 and avg_word_length > 2 else "Low"
|
447 |
-
|
448 |
-
def _detect_language(text_list: List[str]):
|
449 |
-
if not text_list:
|
450 |
-
return "Unknown"
|
451 |
-
persian_pattern = re.compile(r'[\u0600-\u06FF]')
|
452 |
-
english_pattern = re.compile(r'[a-zA-Z]')
|
453 |
-
persian_chars = sum(1 for word in text_list for _ in persian_pattern.finditer(word))
|
454 |
-
english_chars = sum(1 for word in text_list for _ in english_pattern.finditer(word))
|
455 |
-
return "Persian" if persian_chars > english_chars else "English" if english_chars > persian_chars else "Mixed"
|
456 |
-
|
457 |
-
# --- ImagePreprocessor functions ---
|
458 |
-
# Defaults for enhance_for_persian; caller-supplied settings override these.
default_preprocessing_settings = dict(
    resize=True,
    resize_scale=200,
    enhance_contrast=True,
    reduce_noise=True,
    sharpen=True,
    deskew=True,
    threshold=True,
)
|
462 |
-
|
463 |
-
def enhance_for_persian(image: np.ndarray, settings: Dict[str, Any], progress=None):
    """Run the preprocessing pipeline on ``image``.

    Steps, in order: grayscale, resize, CLAHE contrast, bilateral noise
    reduction, sharpening, deskew, adaptive threshold. Each step after
    grayscale is switched by the matching key in ``settings`` merged over
    ``default_preprocessing_settings``.

    Args:
        image: Image array with at least two dimensions.
        settings: Overrides for the defaults; ``None`` means "no overrides".
        progress: Optional callable receiving a float in ``[0, 1]``.

    Returns:
        ``(processed_image, info)`` where ``info`` records the steps applied,
        or ``(None, {})`` when the input is unusable or a step fails. A
        failing deskew step is only logged and skipped.
    """
    info = {}
    if image is None or not isinstance(image, np.ndarray) or image.size == 0 or len(image.shape) < 2:
        logger.error("FAILURE Input image to enhance_for_persian is invalid or empty")
        return None, {}

    logger.debug(f"Enhancing image with shape: {image.shape}")
    processed = image.copy()
    current_settings = default_preprocessing_settings.copy()
    # ``settings or {}`` keeps a None argument from raising before the
    # try-block below can handle errors.
    current_settings.update(settings or {})

    total_steps = 7
    step = 0

    def advance():
        """Mark one pipeline step as done and report the new progress."""
        nonlocal step
        step += 1
        if progress:
            progress(step / total_steps)

    try:
        if progress:
            progress(step / total_steps)

        height, width = processed.shape[:2]
        if height <= 0 or width <= 0:
            logger.error(f"FAILURE Invalid image dimensions: height={height}, width={width}")
            return None, {}

        # --- 1. grayscale ---
        logger.debug(f"Image shape before grayscale: {processed.shape}")
        if len(processed.shape) == 3:
            if processed.shape[2] == 1:
                # Already single-channel; cvtColor would reject it, so just
                # drop the channel axis.
                processed = np.ascontiguousarray(processed[:, :, 0])
                info['grayscale'] = True
            else:
                try:
                    processed = cv2.cvtColor(processed, cv2.COLOR_BGR2GRAY)
                    info['grayscale'] = True
                except cv2.error as e:
                    logger.error(f"FAILURE Grayscale conversion failed: {str(e)}")
                    return None, {}
        advance()

        # --- 2. resize ---
        logger.debug(f"Image shape after grayscale: {processed.shape}")
        if current_settings.get('resize'):
            scale_percent = current_settings.get('resize_scale', 200)
            if scale_percent != 100:
                new_width = int(width * scale_percent / 100)
                new_height = int(height * scale_percent / 100)
                if new_width > 0 and new_height > 0:
                    try:
                        processed = cv2.resize(processed, (new_width, new_height), interpolation=cv2.INTER_CUBIC)
                        info['resized'] = f"{scale_percent}%"
                    except cv2.error as e:
                        logger.error(f"FAILURE Resize failed: {str(e)}")
                        return None, {}
                else:
                    logger.warning(f"WARNING Resize skipped due to invalid dimensions: width={new_width}, height={new_height}")
        advance()

        # --- 3. contrast ---
        logger.debug(f"Image shape after resize: {processed.shape}")
        if current_settings.get('enhance_contrast'):
            try:
                clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
                processed = clahe.apply(processed)
                info['contrast_enhanced'] = True
            except cv2.error as e:
                logger.error(f"FAILURE Contrast enhancement failed: {str(e)}")
                return None, {}
        advance()

        # --- 4. noise reduction ---
        logger.debug(f"Image shape after contrast: {processed.shape}")
        if current_settings.get('reduce_noise'):
            try:
                processed = cv2.bilateralFilter(processed, 9, 75, 75)
                info['noise_reduced'] = True
            except cv2.error as e:
                logger.error(f"FAILURE Noise reduction failed: {str(e)}")
                return None, {}
        advance()

        # --- 5. sharpen ---
        logger.debug(f"Image shape after noise reduction: {processed.shape}")
        if current_settings.get('sharpen'):
            try:
                kernel = np.array([[-1, -1, -1], [-1, 9, -1], [-1, -1, -1]])
                processed = cv2.filter2D(processed, -1, kernel)
                info['sharpened'] = True
            except cv2.error as e:
                logger.error(f"FAILURE Sharpening failed: {str(e)}")
                return None, {}
        advance()

        # --- 6. deskew (best effort: failures are logged, not fatal) ---
        logger.debug(f"Image shape after sharpen: {processed.shape}")
        if current_settings.get('deskew'):
            try:
                coords = np.column_stack(np.where(processed > 0))
                if coords.size > 0:
                    # NOTE(review): the angle handling here differs from the
                    # module-level deskew() helper (no sign flip) - confirm
                    # which convention is intended.
                    angle = cv2.minAreaRect(coords)[-1]
                    if angle < -45:
                        angle = 90 + angle
                    center = (processed.shape[1] // 2, processed.shape[0] // 2)
                    M = cv2.getRotationMatrix2D(center, angle, 1.0)
                    processed = cv2.warpAffine(processed, M, (processed.shape[1], processed.shape[0]),
                                               flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
                    info['deskewed'] = f"angle: {angle:.2f}"
                else:
                    logger.warning("WARNING No contours found for deskewing")
            except Exception as e:
                logger.warning(f"WARNING Deskew failed: {e}")
        advance()

        # --- 7. threshold ---
        logger.debug(f"Image shape after deskew: {processed.shape}")
        if current_settings.get('threshold'):
            try:
                processed = cv2.adaptiveThreshold(processed, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                                  cv2.THRESH_BINARY, 11, 2)
                info['thresholded'] = True
            except cv2.error as e:
                logger.error(f"FAILURE Thresholding failed: {str(e)}")
                return None, {}
        advance()

        logger.debug(f"Image shape after threshold: {processed.shape}")
        return processed, info
    except Exception as e:
        logger.error(f"FAILURE Preprocessing error: {str(e)}")
        if progress:
            progress(1.0)
        return None, {}
|
595 |
-
|
596 |
-
def remove_background(image: np.ndarray):
    """Mask ``image`` with an inverted Otsu threshold of its grayscale version.

    The mask is cleaned with one 3x3 morphological opening before being
    applied. Returns ``None`` for unusable input or when any step raises.
    """
    usable = isinstance(image, np.ndarray) and image.size != 0 and len(image.shape) >= 2
    if not usable:
        logger.error("FAILURE Input image for background removal is invalid or empty")
        return None

    try:
        logger.debug(f"Removing background from image with shape: {image.shape}")
        if len(image.shape) == 2:
            gray = image
        else:
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        _, mask = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
        mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, kernel, iterations=1)
        return cv2.bitwise_and(image, image, mask=mask)
    except Exception as error:
        logger.error(f"FAILURE Background removal failed: {str(error)}")
        return None
|
610 |
-
|
611 |
-
# --- PDFProcessor functions ---
|
612 |
-
# Scratch directory for intermediate PDF page images (created at import time).
pdf_temp_dir = Path(tempfile.mkdtemp())

# Defaults for PDF processing.
pdf_processing_settings = dict(
    dpi=300,
    scale_factor=2,
    split_pages=True,
    extract_images=True,
    batch_size=2,
)
|
616 |
-
|
617 |
-
def process_pdf_document(pdf_path: str, settings: Dict[str, Any], progress=None):
    """Render and preprocess every page of a PDF.

    Args:
        pdf_path: Path of the PDF file.
        settings: Options forwarded to ``_process_pdf_page``.
        progress: Optional progress callable forwarded to the page processor.

    Returns:
        The preprocessed images of all pages in page order, or ``[]`` when
        the file is missing or processing fails.

    Pages are processed one after another on the calling thread: a single
    PyMuPDF document must not be used from several threads at once, and
    sequential processing also keeps the result in page order. The
    ``batch_size`` setting is therefore no longer used here.
    """
    global pdf_temp_dir, pdf_processing_settings
    logger.info(f"START Processing PDF: {pdf_path}")
    all_images = []

    if not os.path.exists(pdf_path):
        logger.error(f"FAILURE PDF file not found: {pdf_path}")
        return []

    try:
        doc = fitz.open(pdf_path)
        try:
            total_pages = len(doc)
            with tqdm(total=total_pages, desc="📄 Processing PDF") as pbar:
                for page_num in range(total_pages):
                    result = _process_pdf_page(doc, page_num, settings, progress)
                    if result and isinstance(result, list):
                        all_images.extend(result)
                    pbar.update(1)
        finally:
            # Release the file handle even when a page fails unexpectedly.
            doc.close()

        logger.info(f"SUCCESS Extracted {len(all_images)} images")
        return all_images
    except Exception as e:
        logger.error(f"FAILURE PDF processing failed: {str(e)}")
        return []
|
648 |
-
|
649 |
-
def _process_pdf_page(doc, page_num: int, settings: Dict[str, Any], progress=None):
    """Render one PDF page, preprocess it, and optionally its embedded images.

    Args:
        doc: An open PyMuPDF document.
        page_num: Zero-based page index.
        settings: Uses ``scale_factor`` (default 2) and ``extract_images``
            (default True); the whole dict is also passed to
            ``enhance_for_persian``.
        progress: Optional callable receiving a float in ``[0, 1]``.

    Returns:
        A list with the preprocessed page render followed by the preprocessed
        embedded images larger than 100x100 pixels; ``[]`` on failure.
    """
    images = []
    try:
        page = doc.load_page(page_num)
        scale = settings.get('scale_factor', 2)
        pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale))
        if pix.n <= 0 or pix.width <= 0 or pix.height <= 0 or not pix.samples:
            logger.error(f"FAILURE Invalid pixmap data for page {page_num + 1}")
            return []

        img_data = np.frombuffer(pix.samples, dtype=np.uint8)
        expected_size = pix.width * pix.height * pix.n
        if img_data.size != expected_size:
            logger.error(f"FAILURE Pixmap data size mismatch for page {page_num + 1}: expected {expected_size}, got {img_data.size}")
            return []

        img = img_data.reshape(pix.height, pix.width, pix.n)
        logger.debug(f"Image shape from pixmap: {img.shape}")
        processed_img, _ = enhance_for_persian(img, settings, progress)
        if processed_img is not None:
            images.append(processed_img)

        if settings.get('extract_images', True):
            for img_info in page.get_images(full=True):
                xref = img_info[0]
                try:
                    base_image = fitz.Pixmap(doc, xref)
                    # Four or more colour components (alpha excluded) means a
                    # non-RGB colourspace such as CMYK: convert to RGB.
                    if base_image.n - base_image.alpha >= 4:
                        base_image = fitz.Pixmap(fitz.csRGB, base_image)
                    if base_image.n <= 0 or base_image.width <= 0 or base_image.height <= 0 or not base_image.samples:
                        logger.warning(f"WARNING Invalid extracted pixmap for page {page_num + 1}, skipping")
                        continue

                    # Reshape with the pixmap's real channel count; guessing
                    # 1 or 3 fails for pixmaps that carry an alpha channel.
                    channels = base_image.n
                    img_array = np.frombuffer(base_image.samples, dtype=np.uint8).reshape(
                        base_image.height, base_image.width, channels)
                    color_channels = channels - base_image.alpha
                    if base_image.alpha and color_channels >= 1:
                        # Drop alpha, which is stored as the last channel.
                        img_array = np.ascontiguousarray(img_array[:, :, :color_channels])
                    if img_array.shape[2] == 1:
                        # Hand grayscale images over as 2-D arrays.
                        img_array = np.ascontiguousarray(img_array[:, :, 0])

                    if img_array.shape[0] > 100 and img_array.shape[1] > 100:
                        processed_img_extracted, _ = enhance_for_persian(img_array, settings, progress)
                        if processed_img_extracted is not None:
                            images.append(processed_img_extracted)
                except Exception as e:
                    logger.warning(f"WARNING Failed to process extracted image for page {page_num + 1}: {str(e)}")
                    continue
        if progress:
            progress(1.0)
        return images
    except Exception as e:
        logger.error(f"FAILURE Page {page_num + 1} processing failed: {str(e)}")
        if progress:
            progress(1.0)
        return []
|
697 |
-
|
698 |
-
def optimize_pdf_document(pdf_path: str, settings: Dict[str, Any], progress=None):
    """Rasterise each page of a PDF, enhance it, and write an image-only PDF.

    Every page is rendered at 2x zoom, passed through ``enhance_for_persian``
    and appended (as a one-page PDF built from the enhanced image) to a new
    document saved as ``/app/optimized_<original file name>``.

    Args:
        pdf_path: Path of the PDF to optimise.
        settings: Preprocessing settings forwarded to ``enhance_for_persian``.
        progress: Optional progress callback; forwarded to
            ``enhance_for_persian`` and called with ``1.0`` when done.

    Returns:
        The path of the optimised PDF as a string, or ``pdf_path`` unchanged
        when the input is missing or optimisation fails for any reason.
    """
    logger.info(f"START Optimizing PDF: {pdf_path}")
    logger.debug(f"PDF Path for optimization: {pdf_path}")
    if not os.path.exists(pdf_path):
        logger.error(f"FAILURE PDF file not found: {pdf_path}")
        return pdf_path

    doc = None
    new_doc = None
    try:
        output_path = Path(f"/app/optimized_{Path(pdf_path).name}")
        doc = fitz.open(pdf_path)
        new_doc = fitz.open()

        total_pages = len(doc)
        for page_num in tqdm(range(total_pages), desc="📄 Optimizing PDF"):
            page = doc.load_page(page_num)
            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
            if pix.n <= 0 or pix.width <= 0 or pix.height <= 0 or not pix.samples:
                logger.error(f"FAILURE Invalid pixmap data for page {page_num + 1}")
                continue

            img_data = np.frombuffer(pix.samples, dtype=np.uint8)
            expected_size = pix.width * pix.height * pix.n
            if img_data.size != expected_size:
                logger.error(f"FAILURE Pixmap data size mismatch for page {page_num + 1}: expected {expected_size}, got {img_data.size}")
                continue

            img = img_data.reshape(pix.height, pix.width, pix.n)
            logger.debug(f"Image shape from pixmap: {img.shape}")
            processed_img, _ = enhance_for_persian(img, settings, progress)
            if processed_img is None:
                continue

            img_path = pdf_temp_dir / f"temp_page_{page_num}.jpg"
            try:
                cv2.imwrite(str(img_path), processed_img)
                # insert_pdf() takes a PDF source document, so the image
                # document is converted to PDF bytes before being appended.
                image_doc = fitz.open(str(img_path))
                try:
                    pdf_bytes = image_doc.convert_to_pdf()
                finally:
                    image_doc.close()
                page_pdf = fitz.open(stream=pdf_bytes, filetype="pdf")
                try:
                    new_doc.insert_pdf(page_pdf)
                finally:
                    page_pdf.close()
            finally:
                # Remove the temp image even when conversion/insertion fails.
                if os.path.exists(img_path):
                    os.remove(img_path)

        new_doc.save(str(output_path))
        logger.info(f"SUCCESS PDF optimized: {output_path}")
        if progress:
            progress(1.0)
        return str(output_path)
    except Exception as e:
        logger.error(f"FAILURE PDF optimization failed: {str(e)}")
        if progress:
            progress(1.0)
        return pdf_path
    finally:
        # Release both document handles on every exit path.
        for handle in (new_doc, doc):
            if handle is not None:
                try:
                    handle.close()
                except Exception as close_error:
                    logger.warning(f"WARNING Failed to close PDF handle: {str(close_error)}")
|
748 |
-
|
749 |
-
def cleanup_pdf_temp_dir():
    """Remove the temporary PDF working directory tree.

    Best-effort: any failure is logged and swallowed rather than raised.
    """
    try:
        shutil.rmtree(pdf_temp_dir)
    except Exception as cleanup_error:
        logger.error(f"FAILURE Temp cleanup failed: {str(cleanup_error)}")
|
755 |
-
|
756 |
-
# --- Cache functions ---
# Module-level state shared by get_cache() / set_cache().
cache_data: "OrderedDict" = OrderedDict()  # insertion-ordered store; oldest entry first
cache_max_size: int = 1000                 # maximum number of entries kept
cache_lock = threading.Lock()              # guards every access to cache_data
cache_dir_path: Path = Path("/app") / "cache"
|
761 |
-
|
762 |
-
def setup_cache_dir():
    """Create the cache directory if it does not exist yet.

    Missing parent directories are created as well, so the call does not
    depend on another piece of start-up code having created them first.
    """
    cache_dir_path.mkdir(parents=True, exist_ok=True)
|
765 |
-
|
766 |
-
def _get_cache_key(image: np.ndarray):
|
767 |
-
return hashlib.md5(image.tobytes()).hexdigest()
|
768 |
-
|
769 |
-
def get_cache(image: np.ndarray):
    """Look up the cached result stored for *image*.

    On a hit the entry is moved to the most-recently-used end of the cache
    and its unpickled value is returned; on a miss ``None`` is returned.
    """
    lookup_key = _get_cache_key(image)
    with cache_lock:
        if lookup_key not in cache_data:
            return None
        # Refresh recency so this entry is evicted last.
        cache_data.move_to_end(lookup_key)
        return pickle.loads(cache_data[lookup_key])
|
778 |
-
|
779 |
-
def set_cache(image: np.ndarray, result: OCRResult):
    """Store *result* in the cache under the key derived from *image*.

    The result is kept in pickled form. When the key is already present its
    entry is refreshed in place (and marked most recently used) without
    evicting anything; otherwise, if the cache is full, the oldest entry is
    dropped to make room.
    """
    key = _get_cache_key(image)
    payload = pickle.dumps(result)  # serialise before taking the lock
    with cache_lock:
        if key in cache_data:
            # Plain assignment would keep the old position; refresh recency.
            cache_data.move_to_end(key)
        elif len(cache_data) >= cache_max_size:
            cache_data.popitem(last=False)
        cache_data[key] = payload
|
786 |
-
|
787 |
-
# --- PersianOCR functions (Main logic) ---
# Baseline settings; callers overlay their own values on a copy of this dict.
default_ocr_settings = dict(
    resize=True,
    resize_scale=200,
    enhance_contrast=True,
    reduce_noise=True,
    sharpen=True,
    deskew=True,
    optimize_for_ocr=True,
    extract_images=True,
    cache_enabled=True,
    max_workers=4,
)
|
793 |
-
|
794 |
-
def process_single_image(image: Union[str, np.ndarray], settings: Optional[Dict] = None, progress=None):
    """Run OCR on one image, first as-is and then after preprocessing.

    Args:
        image: Either a file path to load with OpenCV or an image array.
        settings: Optional overrides applied on top of
            ``default_ocr_settings``.
        progress: Optional progress callback forwarded to the OCR and
            preprocessing helpers.

    Returns:
        An ``OCRResult``. If the raw image yields text that result is
        returned directly; otherwise the image is enhanced with
        ``enhance_for_persian`` and OCR is retried. An empty ``OCRResult``
        (model name ``"None"`` or ``"Error"``) is returned when loading,
        preprocessing or OCR fails.
    """
    start_time = time.time()
    current_settings = default_ocr_settings.copy()
    if settings:
        current_settings.update(settings)
    cache_enabled = current_settings.get('cache_enabled')

    try:
        if isinstance(image, str):
            # Keep the path in its own name so the failure log can report it
            # (cv2.imread returns None on failure).
            image_path = image
            logger.debug(f"Loading image from path: {image_path}")
            image = cv2.imread(image_path)
            if image is None:
                logger.error(f"FAILURE Failed to load image from path: {image_path}")
                return OCRResult([], [], 0.0, "None", 0.0, "Unknown", "Unknown", 0, 0, {}, 0.0)
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            logger.debug(f"Image loaded with shape: {image.shape}")

        if cache_enabled:
            cached_result = get_cache(image)
            if cached_result is not None:
                logger.info("SUCCESS Cache hit")
                return cached_result

        # First attempt: OCR on the untouched image.
        result = process_image(image, progress)
        if result and result.text:
            if cache_enabled:
                set_cache(image, result)
            return result

        # Second attempt: OCR after Persian-specific enhancement.
        processed_image, preprocess_info = enhance_for_persian(image, current_settings, progress)
        if processed_image is None:
            return OCRResult([], [], 0.0, "None", time.time() - start_time, "Unknown", "Unknown", 0, 0, preprocess_info, 0.0)

        result = process_image(processed_image, progress)
        if result:
            result = result._replace(preprocessing_info=preprocess_info)
            if cache_enabled:
                # Cached under the original image so later lookups hit.
                set_cache(image, result)
            return result

        return OCRResult([], [], 0.0, "None", time.time() - start_time, "Unknown", "Unknown", 0, 0, preprocess_info, 0.0)

    except Exception as e:
        logger.error(f"FAILURE Image processing failed: {str(e)}")
        return OCRResult([], [], 0.0, "Error", time.time() - start_time, "Unknown", "Unknown", 0, 0, {}, 0.0)
|
840 |
-
|
841 |
-
def process_pdf(pdf_path: str, settings: Optional[Dict] = None, progress=None):
    """Run OCR over every image obtained from a PDF.

    The PDF is first passed through ``optimize_pdf_document``, then
    ``process_pdf_document`` turns it into a list of images, and each image
    is OCR'd by ``process_single_image`` on a thread pool.

    Args:
        pdf_path: Path of the PDF file.
        settings: Optional overrides applied on top of
            ``default_ocr_settings``.
        progress: Optional progress callback forwarded to the helpers.

    Returns:
        A list of ``OCRResult`` objects that contain text, in the same order
        as the images returned by ``process_pdf_document``. When nothing
        could be extracted the list holds a single empty ``OCRResult``
        (model name ``"None"``, or ``"Error"`` after an exception).
    """
    current_settings = default_ocr_settings.copy()
    if settings:
        current_settings.update(settings)

    logger.info(f"START Processing PDF: {pdf_path}")
    results = []

    if not pdf_path or not os.path.exists(pdf_path):
        logger.error(f"FAILURE PDF file not found or invalid: {pdf_path}")
        return [OCRResult([], [], 0.0, "None", 0.0, "Unknown", "Unknown", 0, 0, {}, 0.0)]

    try:
        optimized_pdf = optimize_pdf_document(pdf_path, current_settings, progress)
        if not optimized_pdf or not os.path.exists(optimized_pdf):
            logger.error(f"FAILURE Optimized PDF not generated: {optimized_pdf}")
            return [OCRResult([], [], 0.0, "None", 0.0, "Unknown", "Unknown", 0, 0, {}, 0.0)]

        images = process_pdf_document(optimized_pdf, current_settings, progress)
        if not images:
            logger.warning("WARNING No images extracted from PDF")
            return [OCRResult([], [], 0.0, "None", 0.0, "Unknown", "Unknown", 0, 0, {}, 0.0)]

        with ThreadPoolExecutor(max_workers=current_settings.get('max_workers')) as executor:
            futures = [
                executor.submit(process_single_image, img, current_settings, progress)
                for img in images
                if img is not None
            ]
            # Collect in submission order (not completion order) so the
            # combined output keeps the document's page sequence.
            for future in futures:
                result = future.result()
                if result and result.text:
                    results.append(result)

        logger.info(f"SUCCESS Processed {len(results)} pages")
        return results if results else [OCRResult([], [], 0.0, "None", 0.0, "Unknown", "Unknown", 0, 0, {}, 0.0)]
    except Exception as e:
        logger.error(f"FAILURE PDF processing failed: {str(e)}")
        return [OCRResult([], [], 0.0, "Error", 0.0, "Unknown", "Unknown", 0, 0, {}, 0.0)]
|
877 |
-
|
878 |
-
# --- رابط کاربری Gradio ---
|
879 |
-
def create_gradio_interface():
    """Build the Gradio Blocks UI for the OCR system and return it.

    The UI has a file upload, an accordion of processing options and seven
    output text boxes (text, numbers, confidence, model, time, image
    quality, preprocessing info) filled by the nested ``process_file``
    handler.
    """

    def _format_info(info):
        """Render a preprocessing-info mapping as ``key: value`` lines."""
        return "\n".join([f"{k}: {v}" for k, v in (info or {}).items()])

    def process_file(file, use_cache: bool, preprocessing: bool, confidence: float, scale: int,
                     enhance_contrast: bool, reduce_noise: bool, extract_images: bool,
                     progress=gr.Progress(track_tqdm=True)):
        """Handle one submit click and return the seven output strings.

        ``progress`` is declared as a default argument because that is how
        Gradio recognises and injects a progress tracker for a handler; it
        is not listed among the click ``inputs``.
        """
        if file is None:
            logger.error("FAILURE No file provided")
            return ("", "", "0.0", "None", "0.0", "Unknown", "No file uploaded")

        settings = {
            'cache_enabled': use_cache, 'preprocessing_enabled': preprocessing, 'confidence_threshold': confidence,
            'resize': True, 'resize_scale': scale, 'enhance_contrast': enhance_contrast,
            'reduce_noise': reduce_noise, 'extract_images': extract_images, 'sharpen': True, 'deskew': True,
            'optimize_for_ocr': True
        }

        try:
            if file.name.lower().endswith('.pdf'):
                results = process_pdf(file.name, settings, progress)

                full_text = "\n".join(" ".join(res.text) for res in results)
                numbers_combined = [number for res in results for number in res.numbers]
                confidences = [f"{res.confidence:.2f}" for res in results]
                models_used = [res.model_name for res in results]
                times = [f"{res.processing_time:.2f} seconds" for res in results]
                qualities = [res.image_quality for res in results]
                # Same empty-info guard as the single-image branch below.
                preprocess_infos = [_format_info(res.preprocessing_info) for res in results]

                combined_preprocess_info = "\nPage-wise Preprocessing Info:\n" + "\n\n".join(preprocess_infos) if preprocess_infos else ""

                return (
                    full_text.strip(),
                    ", ".join(numbers_combined),
                    ", ".join(confidences),
                    ", ".join(models_used),
                    ", ".join(times),
                    ", ".join(qualities),
                    combined_preprocess_info
                )

            result = process_single_image(file.name, settings, progress)
            if result and result.text:
                return (
                    "\n".join(result.text),
                    ", ".join(result.numbers),
                    f"{result.confidence:.2f}",
                    result.model_name,
                    f"{result.processing_time:.2f} seconds",
                    result.image_quality,
                    _format_info(result.preprocessing_info)
                )
            return ("", "", "0.0", "None", "0.0", "Unknown", "No text extracted")

        except Exception as e:
            logger.error(f"FAILURE Interface error: {str(e)}")
            return ("", "", "0.0", "Error", "0.0", "Unknown", str(e))

    with gr.Blocks(title="سیستم OCR فارسی پیشرفته") as interface:
        gr.Markdown("# سیستم OCR فارسی پیشرفته")
        with gr.Row():
            # Left column: upload, options and the submit button.
            with gr.Column():
                file_input = gr.File(label="آپلود فایل (تصویر یا PDF)")
                with gr.Accordion("تنظیمات پیشرفته", open=False):
                    use_cache = gr.Checkbox(label="استفاده از کش (Cache)", value=True)
                    preprocessing = gr.Checkbox(label="فعال‌سازی پیش‌پردازش", value=True)
                    confidence = gr.Slider(0.1, 1.0, value=0.7, label="آستانه اطمینان (Confidence Threshold)")
                    scale = gr.Slider(100, 400, value=200, step=50, label="مقیاس تصویر (%)")
                    enhance_contrast = gr.Checkbox(label="بهبود کنتراست", value=True)
                    reduce_noise = gr.Checkbox(label="کاهش نویز", value=True)
                    extract_images = gr.Checkbox(label="استخراج تصاویر از PDF", value=True)
                submit_btn = gr.Button("پردازش متن")
            # Right column: one text box per element of process_file's tuple.
            with gr.Column():
                outputs = [
                    gr.Textbox(label="متن استخراج‌شده", lines=10),
                    gr.Textbox(label="اعداد استخراج‌شده", lines=2),
                    gr.Textbox(label="میزان اطمینان (Confidence)"),
                    gr.Textbox(label="مدل OCR استفاده‌شده"),
                    gr.Textbox(label="زمان پردازش"),
                    gr.Textbox(label="کیفیت تصویر"),
                    gr.Textbox(label="اطلاعات پیش‌پردازش", lines=5)
                ]

        submit_btn.click(
            fn=process_file,
            inputs=[file_input, use_cache, preprocessing, confidence, scale, enhance_contrast, reduce_noise, extract_images],
            outputs=outputs
        )
    return interface
|
977 |
-
|
978 |
-
# --- تابع اصلی ---
|
979 |
-
def main():
    """Initialise the runtime environment and launch the Gradio app.

    Any exception is logged and then re-raised.
    """
    try:
        logger.info("START Initializing system")
        setup_system_dependencies()  # install system-level dependencies
        for directory in ('/app/logs', '/app/cache'):
            os.makedirs(directory, exist_ok=True)
        setup_cache_dir()

        if torch.cuda.is_available():
            device = "GPU"
        else:
            device = "CPU"
        logger.info(f"SUCCESS Using {device}")

        # Launch Gradio for Hugging Face Spaces.
        app = create_gradio_interface()
        app.launch(server_name="0.0.0.0", server_port=7860, share=False)
    except Exception as startup_error:
        logger.error(f"FAILURE Main error: {str(startup_error)}")
        raise


if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|