Really-amin committed on
Commit
647aae6
·
verified ·
1 Parent(s): 64bb3c8

Delete app/persian_ocr/app.py

Browse files
Files changed (1) hide show
  1. app/persian_ocr/app.py +0 -998
app/persian_ocr/app.py DELETED
@@ -1,998 +0,0 @@
1
- #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
-
4
- import os
5
- import subprocess
6
- os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
7
- import tensorflow as tf
8
- tf.get_logger().setLevel('ERROR')
9
-
10
- import logging
11
- from logging.handlers import RotatingFileHandler
12
- import pytesseract
13
- import easyocr
14
- import cv2
15
- import numpy as np
16
- from PIL import Image
17
- import re
18
- from typing import Dict, List, Optional, Any, Union, Tuple
19
- from hazm import Normalizer, word_tokenize
20
- import json
21
- from datetime import datetime
22
- from transformers import pipeline, TrOCRProcessor, VisionEncoderDecoderModel
23
- import gradio as gr
24
- import fitz
25
- from tqdm import tqdm
26
- from dataclasses import dataclass
27
- from functools import lru_cache
28
- import threading
29
- from concurrent.futures import ThreadPoolExecutor, as_completed
30
- import tempfile
31
- import shutil
32
- from pathlib import Path
33
- import hashlib
34
- import pickle
35
- from collections import OrderedDict
36
- import time
37
- import torch
38
- import psutil
39
- import warnings
40
- import io
41
-
42
- warnings.filterwarnings('ignore')
43
- from collections import namedtuple
44
-
45
# --- System constants ---
# Tesseract binary path and tessdata directory used to configure pytesseract.
TESSERACT_CMD = "/usr/bin/tesseract"
TESSDATA_PREFIX = "/usr/share/tesseract-ocr/4.00/tessdata"

# Status glyphs.
SUCCESS, FAILURE, PROCESSING = "✅", "❌", "🔄"
51
-
52
- # --- نصب وابستگی‌های سیستمی برای Hugging Face Spaces ---
53
def setup_system_dependencies():
    """Install and configure the system packages the OCR stack relies on.

    When the Tesseract binary is missing, runs ``apt-get update`` and installs
    Tesseract together with its Persian language pack.  Then installs the
    OpenCV and Fontconfig system packages and points pytesseract and the
    ``TESSDATA_PREFIX`` environment variable at the configured locations.

    Raises:
        Exception: any failure is logged and re-raised unchanged.
    """
    # NOTE(review): the indentation of this function was lost in extraction;
    # the OpenCV/Fontconfig installs are assumed to run unconditionally
    # (outside the "Tesseract missing" branch) — confirm against the original.
    def apt_install(package):
        subprocess.run(['apt-get', 'install', '-y', package], check=True)

    logger.info("START: نصب وابستگی‌های سیستمی برای Hugging Face Spaces")
    try:
        # Install Tesseract and the Persian language data when absent.
        tesseract_missing = not os.path.isfile(TESSERACT_CMD)
        if tesseract_missing:
            logger.info("PROCESSING: نصب Tesseract OCR")
            subprocess.run(['apt-get', 'update'], check=True)
            for package in ('tesseract-ocr', 'tesseract-ocr-fas'):
                apt_install(package)

        # OpenCV and Fontconfig system libraries.
        for package in ('libopencv-dev', 'fontconfig'):
            apt_install(package)

        # Tell pytesseract where the binary and language data live.
        pytesseract.pytesseract.tesseract_cmd = TESSERACT_CMD
        os.environ['TESSDATA_PREFIX'] = TESSDATA_PREFIX
        logger.info("SUCCESS: وابستگی‌های سیستمی نصب شدند")
    except Exception as exc:
        logger.error(f"FAILURE: خطا در نصب وابستگی‌های سیستمی: {str(exc)}")
        raise
74
-
75
- # --- توابع Persian-OCR (از detect.py) ---
76
def get_grayscale(image):
    """Return a single-channel grayscale version of ``image``.

    Args:
        image: NumPy image array; multi-channel input is treated as BGR.

    Returns:
        The grayscale array, or ``None`` when ``image`` is not a non-empty
        NumPy array with at least two dimensions.  Input that is already
        single-channel is returned without conversion.
    """
    if not isinstance(image, np.ndarray) or image.size == 0 or image.ndim < 2:
        return None
    if image.ndim == 2:
        # Already grayscale; cv2.cvtColor(..., COLOR_BGR2GRAY) would reject it.
        return image
    if image.shape[2] == 1:
        # (h, w, 1) is grayscale with an explicit channel axis.
        return image[:, :, 0]
    return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
80
-
81
def remove_noise(image):
    """Apply a median blur with aperture 3 to ``image``.

    Returns ``None`` when ``image`` is not a non-empty NumPy array with at
    least two dimensions.
    """
    usable = (
        isinstance(image, np.ndarray)
        and image.size != 0
        and len(image.shape) >= 2
    )
    if not usable:
        return None
    return cv2.medianBlur(image, 3)
85
-
86
def thresholding(image):
    """Binarise ``image`` with a fixed threshold of 160 (``THRESH_BINARY``).

    Returns:
        The thresholded image, or ``None`` when ``image`` is not a non-empty
        NumPy array with at least two dimensions.  (The invalid-input case
        used to return a ``(None, None)`` tuple, which did not match the
        single-value return of the success path.)
    """
    if not isinstance(image, np.ndarray) or image.size == 0 or image.ndim < 2:
        return None
    # cv2.threshold returns (threshold_used, image); only the image is wanted.
    return cv2.threshold(image, 160, 255, cv2.THRESH_BINARY)[1]
90
-
91
def dilate(image):
    """Dilate ``image`` once with a 5x5 all-ones kernel.

    Returns ``None`` when ``image`` is not a non-empty NumPy array with at
    least two dimensions.
    """
    if not isinstance(image, np.ndarray):
        return None
    if image.size == 0 or len(image.shape) < 2:
        return None
    structuring = np.ones((5, 5), np.uint8)
    return cv2.dilate(image, structuring, iterations=1)
96
-
97
def erode(image):
    """Erode ``image`` once with a 5x5 all-ones kernel.

    Returns ``None`` when ``image`` is not a non-empty NumPy array with at
    least two dimensions.
    """
    if not isinstance(image, np.ndarray):
        return None
    if image.size == 0 or len(image.shape) < 2:
        return None
    structuring = np.ones((5, 5), np.uint8)
    return cv2.erode(image, structuring, iterations=1)
102
-
103
def opening(image):
    """Apply a morphological opening to ``image`` with a 5x5 all-ones kernel.

    Returns ``None`` when ``image`` is not a non-empty NumPy array with at
    least two dimensions.
    """
    if not isinstance(image, np.ndarray):
        return None
    if image.size == 0 or len(image.shape) < 2:
        return None
    structuring = np.ones((5, 5), np.uint8)
    return cv2.morphologyEx(image, cv2.MORPH_OPEN, structuring)
108
-
109
def canny(image):
    """Run Canny edge detection on ``image`` with thresholds 100 and 200.

    Returns ``None`` when ``image`` is not a non-empty NumPy array with at
    least two dimensions.
    """
    usable = (
        isinstance(image, np.ndarray)
        and image.size != 0
        and len(image.shape) >= 2
    )
    return cv2.Canny(image, 100, 200) if usable else None
113
-
114
def deskew(image):
    """Rotate ``image`` to undo the skew of its non-zero pixels.

    The skew angle is taken from the minimum-area rectangle enclosing every
    non-zero pixel, and the image is rotated about its centre by the opposite
    angle (same output size, replicated border).

    Returns:
        The rotated image; ``image`` itself when it has no non-zero pixels;
        ``None`` when ``image`` is not a non-empty NumPy array with at least
        two dimensions.
    """
    if not isinstance(image, np.ndarray) or image.size == 0 or image.ndim < 2:
        return None
    coords = np.column_stack(np.where(image > 0))
    if coords.size == 0:
        return image

    angle = cv2.minAreaRect(coords)[-1]
    # The correction below was written for the legacy [-90, 0) angle range.
    # Newer OpenCV releases report the angle in (0, 90] (per upstream issue
    # reports for 4.5.1+), which would turn an upright page by 90 degrees;
    # map such values back onto the legacy range first.
    if angle > 0:
        angle -= 90
    angle = -(90 + angle) if angle < -45 else -angle

    height, width = image.shape[:2]
    rotation = cv2.getRotationMatrix2D((width // 2, height // 2), angle, 1.0)
    return cv2.warpAffine(
        image,
        rotation,
        (width, height),
        flags=cv2.INTER_CUBIC,
        borderMode=cv2.BORDER_REPLICATE,
    )
130
-
131
def match_template(image, template):
    """Match ``template`` against ``image`` using ``TM_CCOEFF_NORMED``.

    Returns the result of ``cv2.matchTemplate``, or ``None`` unless both
    arguments are NumPy arrays.
    """
    both_arrays = isinstance(image, np.ndarray) and isinstance(template, np.ndarray)
    if not both_arrays:
        return None
    return cv2.matchTemplate(image, template, cv2.TM_CCOEFF_NORMED)
135
-
136
def persian_ocr_main(image: np.ndarray, langs="fa", mode="tn"):
    """Run Tesseract on ``image`` after rescaling it to a width of 1024 px.

    The image is written to a PNG, resized with Pillow (LANCZOS) and saved at
    300 dpi, read back with OpenCV, converted to grayscale and passed to
    Tesseract with ``--psm 6``.

    Args:
        image: image array as accepted by ``cv2.imwrite``.
        langs: ``"fa"`` (Persian), ``"en"`` (English) or ``"faen"`` (both).
        mode: only used when ``langs == "fa"``: ``"t"`` blacklists digits and
            a few symbols, ``"tn"`` whitelists Persian letters, digits and
            punctuation, ``"table"`` whitelists Persian letters and digits.

    Returns:
        The recognised text, or ``""`` when ``image`` is invalid or the
        rescaled image cannot be converted to grayscale.

    Raises:
        ValueError: for an unsupported ``langs`` or ``mode`` value.
    """
    if not isinstance(image, np.ndarray) or image.size == 0 or image.ndim < 2:
        return ""

    # Persian digits plus the Arabic-Indic one (U+0661) that the original
    # lists carried in place of the Persian one (U+06F1); both are kept.
    digits = "۰۱\u0661۲۳۴۵۶۷۸۹"
    letters = "آابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی"
    persian_configs = {
        "t": f'-l fas --psm 6 -c tessedit_char_blacklist="{digits}«»1234567890#"',
        "tn": f'-l fas --psm 6 -c tessedit_char_whitelist="{letters} {digits}.?!,،:;/"',
        "table": f'-l fas --psm 6 -c tessedit_char_whitelist="{letters}{digits}"',
    }

    # Resolve the configuration before touching the filesystem so that bad
    # arguments fail fast and leave nothing behind.
    if langs == "fa":
        if mode not in persian_configs:
            raise ValueError("Choose valid mode options.")
        custom_config = persian_configs[mode]
    elif langs == "en":
        custom_config = r'-l eng --psm 6'
    elif langs == "faen":
        custom_config = r'-l fas+eng --psm 6'
    else:
        raise ValueError("Choose valid language options.")

    # A temporary directory guarantees the intermediate files are removed on
    # every exit path, including exceptions.
    with tempfile.TemporaryDirectory() as workdir:
        source_path = os.path.join(workdir, "input.png")
        upscaled_path = os.path.join(workdir, "input_Upscaled.png")

        cv2.imwrite(source_path, image)
        with Image.open(source_path) as im:
            width, height = im.size
            factor = 1024.0 / width
            size = max(1, int(factor * width)), max(1, int(factor * height))
            im.resize(size, Image.Resampling.LANCZOS).save(upscaled_path, dpi=(300, 300))

        # Only the grayscale step of the preprocessing chain is applied.
        gray = get_grayscale(cv2.imread(upscaled_path))
        if gray is None:
            return ""
        return pytesseract.image_to_string(gray, config=custom_config)
181
-
182
- # --- پیکربندی گزارش‌گیری ---
183
class CustomFormatter(logging.Formatter):
    """Console formatter that colours records by level and decorates messages.

    Each formatted message gets the current system memory usage appended and
    is prefixed with one emoji per status keyword it contains.
    """

    # ANSI colour escape sequences.
    grey = "\x1b[38;21m"
    blue = "\x1b[38;5;39m"
    yellow = "\x1b[38;5;226m"
    red = "\x1b[38;5;196m"
    bold_red = "\x1b[31;1m"
    reset = "\x1b[0m"
    # Status keyword -> emoji prepended when the keyword occurs in a message.
    STATUS_EMOJI = {
        'START': '🟦', 'SUCCESS': '✅', 'FAILURE': '❌', 'LOADING': '⏳',
        'PROCESSING': '🔄', 'WARNING': '⚠️', 'MEMORY': '💾'
    }

    def __init__(self, fmt):
        """Build one colour-wrapped format string per standard level."""
        super().__init__(fmt)
        self.fmt = fmt
        self.FORMATS = {
            logging.DEBUG: self.grey + self.fmt + self.reset,
            logging.INFO: self.blue + self.fmt + self.reset,
            logging.WARNING: self.yellow + self.fmt + self.reset,
            logging.ERROR: self.red + self.fmt + self.reset,
            logging.CRITICAL: self.bold_red + self.fmt + self.reset
        }

    def format(self, record):
        """Return ``record`` rendered with colour, memory usage and emojis."""
        log_fmt = self.FORMATS.get(record.levelno)
        formatter = logging.Formatter(log_fmt)

        # Interpolate any %-args first: the memory suffix contains a literal
        # '%' that must never be fed through %-formatting.
        memory_usage = psutil.virtual_memory().percent
        message = f"{record.getMessage()} [Mem: {memory_usage:.1f}%]"
        for status, emoji in self.STATUS_EMOJI.items():
            if status in message:
                message = f"{emoji} {message}"

        # Decorate a copy: the same record object is handed to every handler,
        # so mutating it would leak the decoration into their output.
        decorated = logging.makeLogRecord(record.__dict__)
        decorated.msg = message
        decorated.args = None
        return formatter.format(decorated)
213
-
214
# Module logger: everything from DEBUG up goes to a rotating log file
# (plain format) and to the console (coloured CustomFormatter).
logs_dir = "/app/logs"
log_file = os.path.join(logs_dir, "ocr.log")
os.makedirs(logs_dir, exist_ok=True)

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

file_handler = RotatingFileHandler(
    log_file,
    maxBytes=10 * 1024 * 1024,
    backupCount=5,
    encoding='utf-8',
)
file_handler.setFormatter(
    logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
)
logger.addHandler(file_handler)

console_handler = logging.StreamHandler()
console_handler.setFormatter(
    CustomFormatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
)
logger.addHandler(console_handler)
225
-
226
# --- OCRResult record (a namedtuple) ---
OCRResult = namedtuple(
    'OCRResult',
    'text numbers confidence model_name processing_time '
    'image_quality detected_language word_count char_count '
    'preprocessing_info error_rate',
)
230
-
231
# --- Model management ---
# Loaded models, keyed by model name (filled in by load_model).
models = {}

# Default order in which the OCR engines are tried.
model_priority = ['pretrained_model', 'mT5_OCR_fa', 'LayoutLMv3_fa', 'easyocr', 'tesseract', 'persian_ocr']

# Per-model success / failure counters.
model_performance = {name: {'success': 0, 'fail': 0} for name in model_priority}

# Settings for the configured models; 'easyocr' and 'tesseract' have no
# entry here.
model_configs = {
    'pretrained_model': dict(
        name="beheshti-ai/TrOCR-fa", type="transformer", threshold=0.8, device="cpu"),
    'mT5_OCR_fa': dict(
        name="aleemeconomist/mT5-OCR-fa", type="image-to-text", threshold=0.7, device="cpu"),
    'LayoutLMv3_fa': dict(
        name="SoheilStar/LayoutLMv3-fa", type="document-question-answering", threshold=0.7, device="cpu"),
    'persian_ocr': dict(
        name="Persian-OCR", type="custom", threshold=0.75, device="cpu"),
}

model_lock = threading.Lock()
normalizer = Normalizer()
250
-
251
def load_model(model_name: str, progress=None):
    """Load ``model_name`` into the module-level ``models`` registry if needed.

    Args:
        model_name: a key of ``model_configs``, or ``"easyocr"`` /
            ``"tesseract"``.
        progress: optional callable receiving a completion fraction in [0, 1].

    Returns:
        ``True`` when the model is available in ``models`` afterwards,
        ``False`` when loading failed or the name/type is not recognised.
        The outcome is also counted in ``model_performance``.
    """
    global models, model_performance, model_configs, model_lock

    def report(fraction):
        if progress:
            progress(fraction)

    # The lock is held for the whole load so that two threads cannot build
    # the same model at the same time.
    with model_lock:
        if model_name in models:
            return True

        # Names missing from the counters must not raise in the error path.
        stats = model_performance.setdefault(model_name, {'success': 0, 'fail': 0})
        logger.info(f"START Loading model: {model_name}")
        try:
            config = model_configs.get(model_name)
            if config is not None:
                kind = config['type']
                if kind in ("image-to-text", "document-question-answering"):
                    loaded = pipeline(kind, model=config['name'], device=config['device'])
                elif kind == "transformer":
                    report(0.3)
                    processor = TrOCRProcessor.from_pretrained(config['name'])
                    report(0.6)
                    model_instance = VisionEncoderDecoderModel.from_pretrained(config['name'])
                    loaded = {'processor': processor, 'model': model_instance, 'device': config['device']}
                elif kind == "custom" and model_name == "persian_ocr":
                    loaded = True  # nothing to load for this engine
                else:
                    raise ValueError(f"Unsupported model type: {kind}")
            elif model_name == "easyocr":
                report(0.5)
                loaded = easyocr.Reader(['fa', 'en'], gpu=torch.cuda.is_available())
            elif model_name == "tesseract":
                # Point pytesseract at the configured binary and language data.
                pytesseract.pytesseract.tesseract_cmd = TESSERACT_CMD
                os.environ['TESSDATA_PREFIX'] = TESSDATA_PREFIX
                loaded = True
            else:
                # Previously an unknown name was reported as loaded.
                raise ValueError(f"Unknown model: {model_name}")
            models[model_name] = loaded
        except Exception as e:
            logger.error(f"FAILURE Error loading {model_name}: {str(e)}")
            stats['fail'] += 1
            report(1.0)
            return False

        logger.info(f"SUCCESS Model {model_name} loaded")
        stats['success'] += 1
        report(1.0)
        return True
296
-
297
def process_image(image: np.ndarray, progress=None):
    """Run the OCR engines on ``image`` until one yields an acceptable result.

    Engines are tried in ``model_priority`` order, re-sorted (stably) by their
    ``success / (fail + 1)`` ratio from ``model_performance``.  The first
    engine that returns non-blank text with a confidence at or above its
    threshold wins.

    Args:
        image: image array to recognise.
        progress: optional callable receiving a completion fraction.

    Returns:
        An ``OCRResult``, or ``None`` when no engine succeeded.

    Raises:
        ValueError: when ``image`` is not a non-empty array with at least two
            dimensions.
    """
    global models, model_performance, model_priority, model_configs
    started = time.time()
    if progress:
        progress(0.0)

    usable = (
        isinstance(image, np.ndarray)
        and image.size != 0
        and len(image.shape) >= 2
    )
    if not usable:
        logger.error("FAILURE Input image to process_image is invalid or empty")
        raise ValueError("Input image is invalid or empty")

    logger.debug(f"Processing image with shape: {image.shape}")

    def success_ratio(name):
        counts = model_performance[name]
        return counts['success'] / (counts['fail'] + 1)

    candidates = sorted(model_priority, key=success_ratio, reverse=True)
    total = len(candidates)

    # NOTE(review): indentation was lost in extraction; the dispatch below
    # assumes 'persian_ocr' is handled inside the configured-model branch and
    # 'easyocr' / 'tesseract' outside it — confirm against the original.
    for position, model_name in enumerate(candidates, start=1):
        try:
            if not load_model(model_name, progress):
                continue

            if progress:
                progress(position / total)

            config = model_configs.get(model_name)
            result_dict = None
            if model_name in model_configs:
                model_type = config['type']
                if model_type == "transformer":
                    result_dict = _process_transformer_model_full(image, model_name)
                elif model_type in ("image-to-text", "document-question-answering"):
                    result_dict = _process_transformer_model(image, model_name)
                elif model_name == 'persian_ocr':
                    result_dict = _process_persian_ocr(image)
            elif model_name == 'easyocr':
                result_dict = _process_easyocr(image)
            elif model_name == 'tesseract':
                result_dict = _process_tesseract(image)

            has_text = bool(
                result_dict and 'text' in result_dict and result_dict['text'].strip()
            )
            if not has_text:
                continue

            elapsed = time.time() - started
            ocr_result = _format_result(
                result_dict['text'],
                result_dict.get('confidence', 0.5),
                model_name,
                elapsed,
            )

            threshold = model_configs.get(model_name, {}).get('threshold', 0.5)
            if ocr_result.confidence >= threshold:
                logger.info(f"SUCCESS Model {model_name} succeeded")
                if progress:
                    progress(1.0)
                return ocr_result
        except Exception as e:
            logger.warning(f"WARNING Model {model_name} failed: {str(e)}")
            continue

    logger.warning("WARNING No model succeeded")
    if progress:
        progress(1.0)
    return None
362
-
363
def _process_transformer_model(image: np.ndarray, model_name: str):
    """Run a loaded Hugging Face pipeline model on ``image``.

    Args:
        image: image array; 2-D input is treated as grayscale, anything else
            as BGR.
        model_name: key of the pipeline in ``models`` / ``model_configs``.

    Returns:
        ``{'text': ..., 'confidence': ...}`` where the confidence is the
        model's configured threshold (the pipelines report no score here).
    """
    global models, model_configs
    # Grayscale (2-D) input would make the BGR->RGB conversion raise.
    conversion = cv2.COLOR_GRAY2RGB if image.ndim == 2 else cv2.COLOR_BGR2RGB
    pil_image = Image.fromarray(cv2.cvtColor(image, conversion))
    model = models[model_name]
    confidence = model_configs[model_name]['threshold']

    if model_configs[model_name]['type'] == "image-to-text":
        result = model(pil_image)[0]
        return {'text': result['generated_text'], 'confidence': confidence}

    # "document-question-answering"
    # NOTE(review): this pipeline is invoked without a question and its result
    # is indexed like a dict — verify against the transformers pipeline API.
    result = model(pil_image)
    return {'text': result['answer'], 'confidence': confidence}
374
-
375
def _process_transformer_model_full(image: np.ndarray, model_name: str):
    """Run a processor + encoder-decoder model pair on ``image``.

    Args:
        image: image array; 2-D input is treated as grayscale, anything else
            as BGR.
        model_name: key in ``models`` whose entry holds ``'processor'``,
            ``'model'`` and ``'device'``.

    Returns:
        ``{'text': ..., 'confidence': ...}`` where the confidence is the
        model's configured threshold.
    """
    global models, model_configs
    entry = models[model_name]
    processor, model, device = entry['processor'], entry['model'], entry['device']

    # Grayscale (2-D) input would make the BGR->RGB conversion raise.
    conversion = cv2.COLOR_GRAY2RGB if image.ndim == 2 else cv2.COLOR_BGR2RGB
    pil_image = Image.fromarray(cv2.cvtColor(image, conversion))

    pixel_values = processor(images=pil_image, return_tensors="pt").pixel_values.to(device)
    generated_ids = model.generate(pixel_values)
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    return {'text': generated_text, 'confidence': model_configs[model_name]['threshold']}
387
-
388
def _process_easyocr(image: np.ndarray):
    """Recognise text with the loaded EasyOCR reader.

    Returns a dict with the detected fragments joined by single spaces under
    ``'text'`` and the mean detection confidence under ``'confidence'``
    (empty text and 0 when nothing is detected).
    """
    global models
    detections = models['easyocr'].readtext(image)
    if not detections:
        return {'text': '', 'confidence': 0}

    fragments = []
    confidence_total = 0
    for _, fragment, score in detections:
        fragments.append(fragment)
        confidence_total += score
    return {
        'text': ' '.join(fragments),
        'confidence': confidence_total / len(detections),
    }
397
-
398
def _process_tesseract(image: np.ndarray):
    """Recognise Persian + English text with Tesseract (``--oem 3 --psm 6``).

    The confidence is a fixed 0.5.
    """
    tesseract_config = '--oem 3 --psm 6 -l fas+eng'
    recognised = pytesseract.image_to_string(image, config=tesseract_config)
    return dict(text=recognised, confidence=0.5)
401
-
402
- def _process_persian_ocr(image: np.ndarray):
403
- try:
404
- if image is None or not isinstance(image, np.ndarray) or image.size == 0 or len(image.shape) < 2:
405
- return {'text': '', 'confidence': 0}
406
- text = persian_ocr_main(image, langs="fa", mode="tn")
407
- return {'text': text, 'confidence': 0.75}
408
- except Exception as e:
409
- logger.error(f"FAILURE Persian-OCR processing failed: {str(e)}")
410
- return {'text': '', 'confidence': 0}
411
-
412
def _format_result(text: str, confidence: float, model_name: str, processing_time: float):
    """Normalise raw OCR text and package it as an ``OCRResult``.

    The text is normalised and tokenised; tokens made only of ASCII/Persian
    digits (optionally with one decimal separator) go to ``numbers``, the
    rest to ``text``.  Quality, language, counts and error rate are derived
    from the non-numeric tokens.
    """
    global normalizer
    tokens = word_tokenize(normalizer.normalize(text))

    persian_nums = '۰۱۲۳۴۵۶۷۸۹'
    number_pattern = f'^[0-9{persian_nums}]+([\\.,،٫][0-9{persian_nums}]+)?$'

    numbers, text_list = [], []
    for token in tokens:
        target = numbers if re.match(number_pattern, token) else text_list
        target.append(token)

    return OCRResult(
        text=text_list,
        numbers=numbers,
        confidence=confidence,
        model_name=model_name,
        processing_time=processing_time,
        image_quality=_assess_quality(text_list),
        detected_language=_detect_language(text_list),
        word_count=len(text_list),
        char_count=sum(map(len, text_list)),
        preprocessing_info={},
        error_rate=_estimate_error_rate(text_list, confidence),
    )
434
-
435
- def _estimate_error_rate(text_list: List[str], confidence: float):
436
- if not text_list:
437
- return 1.0
438
- avg_word_length = sum(len(w) for w in text_list) / len(text_list) if text_list else 0
439
- return max(0.0, min(1.0, 1.0 - confidence + (3 - avg_word_length) / 10))
440
-
441
- def _assess_quality(text_list: List[str]):
442
- if not text_list:
443
- return "Low"
444
- avg_word_length = sum(len(w) for w in text_list) / len(text_list) if text_list else 0
445
- word_count = len(text_list)
446
- return "High" if word_count > 50 and avg_word_length > 3 else "Medium" if word_count > 20 and avg_word_length > 2 else "Low"
447
-
448
- def _detect_language(text_list: List[str]):
449
- if not text_list:
450
- return "Unknown"
451
- persian_pattern = re.compile(r'[\u0600-\u06FF]')
452
- english_pattern = re.compile(r'[a-zA-Z]')
453
- persian_chars = sum(1 for word in text_list for _ in persian_pattern.finditer(word))
454
- english_chars = sum(1 for word in text_list for _ in english_pattern.finditer(word))
455
- return "Persian" if persian_chars > english_chars else "English" if english_chars > persian_chars else "Mixed"
456
-
457
- # --- ImagePreprocessor functions ---
458
# Defaults for the preprocessing steps; callers may override any key.
default_preprocessing_settings = dict(
    resize=True,
    resize_scale=200,
    enhance_contrast=True,
    reduce_noise=True,
    sharpen=True,
    deskew=True,
    threshold=True,
)
462
-
463
def enhance_for_persian(image: np.ndarray, settings: Dict[str, Any], progress=None):
    """Preprocess ``image`` for Persian OCR.

    Steps, in order: grayscale conversion (always), then resize, CLAHE
    contrast enhancement, bilateral noise reduction, sharpening, deskew and
    adaptive thresholding.  Each optional step runs when its key in
    ``settings`` (merged over ``default_preprocessing_settings``) is truthy.

    Args:
        image: image array; multi-channel input is treated as BGR.
        settings: overrides for the default settings; ``None`` means none.
        progress: optional callable receiving a completion fraction.

    Returns:
        ``(processed_image, info)`` where ``info`` records the applied steps,
        or ``(None, {})`` when the input is invalid or a step fails.  A failed
        deskew is only logged and does not abort the pipeline.
    """
    if not isinstance(image, np.ndarray) or image.size == 0 or image.ndim < 2:
        logger.error("FAILURE Input image to enhance_for_persian is invalid or empty")
        return None, {}

    logger.debug(f"Enhancing image with shape: {image.shape}")
    info = {}
    processed = image.copy()
    current_settings = default_preprocessing_settings.copy()
    current_settings.update(settings or {})
    # Resize targets are computed from the input dimensions.
    height, width = processed.shape[:2]

    def to_gray(img):
        if img.ndim == 3:
            if img.shape[2] == 1:
                # Single-channel input with an explicit channel axis;
                # cvtColor(BGR2GRAY) would reject it.
                img = np.ascontiguousarray(img[:, :, 0])
            else:
                img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            info['grayscale'] = True
        return img

    def resize(img):
        scale_percent = current_settings.get('resize_scale', 200)
        if scale_percent == 100:
            return img
        new_width = int(width * scale_percent / 100)
        new_height = int(height * scale_percent / 100)
        if new_width > 0 and new_height > 0:
            img = cv2.resize(img, (new_width, new_height), interpolation=cv2.INTER_CUBIC)
            info['resized'] = f"{scale_percent}%"
        else:
            logger.warning(f"WARNING Resize skipped due to invalid dimensions: width={new_width}, height={new_height}")
        return img

    def enhance_contrast(img):
        clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
        img = clahe.apply(img)
        info['contrast_enhanced'] = True
        return img

    def reduce_noise(img):
        img = cv2.bilateralFilter(img, 9, 75, 75)
        info['noise_reduced'] = True
        return img

    def sharpen(img):
        kernel = np.array([[-1, -1, -1], [-1, 9, -1], [-1, -1, -1]])
        img = cv2.filter2D(img, -1, kernel)
        info['sharpened'] = True
        return img

    def deskew_step(img):
        # Best effort: any failure is logged and the image is left as it is.
        try:
            coords = np.column_stack(np.where(img > 0))
            if coords.size == 0:
                logger.warning("WARNING No contours found for deskewing")
                return img
            angle = cv2.minAreaRect(coords)[-1]
            # Same correction as deskew(): newer OpenCV releases report the
            # angle in (0, 90] instead of the legacy [-90, 0), so map it back
            # before negating.  The previous code neither negated the angle
            # nor handled the newer range.
            # NOTE(review): "img > 0" selects nearly every pixel of a
            # light-background page, so the estimate is only meaningful on
            # dark-background input — confirm the intended polarity.
            if angle > 0:
                angle -= 90
            angle = -(90 + angle) if angle < -45 else -angle
            rows, cols = img.shape[:2]
            rotation = cv2.getRotationMatrix2D((cols // 2, rows // 2), angle, 1.0)
            img = cv2.warpAffine(img, rotation, (cols, rows),
                                 flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
            info['deskewed'] = f"angle: {angle:.2f}"
        except Exception as e:
            logger.warning(f"WARNING Deskew failed: {e}")
        return img

    def threshold(img):
        img = cv2.adaptiveThreshold(img, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                    cv2.THRESH_BINARY, 11, 2)
        info['thresholded'] = True
        return img

    # (settings key or None for "always", debug label, failure label, step)
    steps = (
        (None, "grayscale", "Grayscale conversion", to_gray),
        ('resize', "resize", "Resize", resize),
        ('enhance_contrast', "contrast", "Contrast enhancement", enhance_contrast),
        ('reduce_noise', "noise reduction", "Noise reduction", reduce_noise),
        ('sharpen', "sharpen", "Sharpening", sharpen),
        ('deskew', "deskew", "Deskew", deskew_step),
        ('threshold', "threshold", "Thresholding", threshold),
    )

    try:
        if progress:
            progress(0.0)
        logger.debug(f"Image shape before grayscale: {processed.shape}")
        for index, (setting, label, failure, step) in enumerate(steps, start=1):
            if setting is None or current_settings.get(setting):
                try:
                    processed = step(processed)
                except cv2.error as e:
                    logger.error(f"FAILURE {failure} failed: {str(e)}")
                    return None, {}
            if progress:
                progress(index / len(steps))
            logger.debug(f"Image shape after {label}: {processed.shape}")
        return processed, info
    except Exception as e:
        logger.error(f"FAILURE Preprocessing error: {str(e)}")
        if progress:
            progress(1.0)
        return None, {}
595
-
596
def remove_background(image: np.ndarray):
    """Mask ``image`` with an inverted Otsu threshold of its grayscale version.

    The mask is cleaned with one 3x3 morphological opening before being
    applied.  Returns the masked image, or ``None`` when the input is invalid
    or any step raises (the error is logged).
    """
    usable = (
        isinstance(image, np.ndarray)
        and image.size != 0
        and len(image.shape) >= 2
    )
    if not usable:
        logger.error("FAILURE Input image for background removal is invalid or empty")
        return None

    try:
        logger.debug(f"Removing background from image with shape: {image.shape}")
        if len(image.shape) == 2:
            gray = image
        else:
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        _, mask = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
        mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, kernel, iterations=1)
        return cv2.bitwise_and(image, image, mask=mask)
    except Exception as exc:
        logger.error(f"FAILURE Background removal failed: {str(exc)}")
        return None
610
-
611
- # --- PDFProcessor functions ---
612
# Scratch directory for intermediate PDF files (created at import time).
pdf_temp_dir = Path(tempfile.mkdtemp())

# Default PDF processing options.
pdf_processing_settings = dict(
    dpi=300,
    scale_factor=2,
    split_pages=True,
    extract_images=True,
    batch_size=2,
)
616
-
617
def process_pdf_document(pdf_path: str, settings: Dict[str, Any], progress=None):
    """Render every page of a PDF and return the preprocessed page images.

    Pages are handled one after another, in page order.  The previous
    implementation shared the open document between worker threads, which
    PyMuPDF documents as unsupported, and collected results in completion
    order; ``settings['batch_size']`` is therefore no longer used here.

    Args:
        pdf_path: path of the PDF file.
        settings: options forwarded to ``_process_pdf_page``; ``None`` means
            no overrides.
        progress: optional callable receiving a completion fraction.

    Returns:
        A list of image arrays (page renderings and extracted images), or
        ``[]`` when the file is missing or processing fails.
    """
    global pdf_temp_dir, pdf_processing_settings
    logger.info(f"START Processing PDF: {pdf_path}")

    if not os.path.exists(pdf_path):
        logger.error(f"FAILURE PDF file not found: {pdf_path}")
        return []

    settings = settings or {}
    all_images = []
    doc = None
    try:
        doc = fitz.open(pdf_path)
        total_pages = len(doc)
        with tqdm(total=total_pages, desc="📄 Processing PDF") as pbar:
            for page_num in range(total_pages):
                page_images = _process_pdf_page(doc, page_num, settings, progress)
                if page_images and isinstance(page_images, list):
                    all_images.extend(page_images)
                pbar.update(1)

        logger.info(f"SUCCESS Extracted {len(all_images)} images")
        return all_images
    except Exception as e:
        logger.error(f"FAILURE PDF processing failed: {str(e)}")
        return []
    finally:
        # The document handle was previously never released.
        if doc is not None:
            doc.close()
648
-
649
def _process_pdf_page(doc, page_num: int, settings: Dict[str, Any], progress=None):
    """Render one PDF page and collect its preprocessed images.

    The page is rendered at ``settings['scale_factor']`` (default 2) and run
    through ``enhance_for_persian``.  When ``settings['extract_images']`` is
    truthy (the default), images embedded in the page that are larger than
    100x100 pixels are preprocessed and added as well.

    Args:
        doc: open PyMuPDF document.
        page_num: zero-based page index.
        settings: processing options, also forwarded to the preprocessing.
        progress: optional callable receiving a completion fraction.

    Returns:
        A list of image arrays; ``[]`` when the page cannot be rendered or an
        unexpected error occurs.  A failing embedded image is logged and
        skipped.
    """
    images = []
    try:
        page = doc.load_page(page_num)
        scale = settings.get('scale_factor', 2)
        pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale))
        if pix.n <= 0 or pix.width <= 0 or pix.height <= 0 or not pix.samples:
            logger.error(f"FAILURE Invalid pixmap data for page {page_num + 1}")
            return []

        img_data = np.frombuffer(pix.samples, dtype=np.uint8)
        expected_size = pix.width * pix.height * pix.n
        if img_data.size != expected_size:
            logger.error(f"FAILURE Pixmap data size mismatch for page {page_num + 1}: expected {expected_size}, got {img_data.size}")
            return []

        img = img_data.reshape(pix.height, pix.width, pix.n)
        logger.debug(f"Image shape from pixmap: {img.shape}")
        processed_img, _ = enhance_for_persian(img, settings, progress)
        if processed_img is not None:
            images.append(processed_img)

        if settings.get('extract_images', True):
            for img_info in page.get_images(full=True):
                xref = img_info[0]
                try:
                    base_image = fitz.Pixmap(doc, xref)
                    # Convert colourspaces with 4+ colour components (e.g.
                    # CMYK) to RGB; alpha is not a colour component.
                    if base_image.n - base_image.alpha >= 4:
                        base_image = fitz.Pixmap(fitz.csRGB, base_image)
                    if base_image.n <= 0 or base_image.width <= 0 or base_image.height <= 0 or not base_image.samples:
                        logger.warning(f"WARNING Invalid extracted pixmap for page {page_num + 1}, skipping")
                        continue
                    # Reshape with the real channel count; guessing 3 or 1
                    # made every image with an alpha channel fail.
                    img_array = np.frombuffer(base_image.samples, dtype=np.uint8).reshape(
                        base_image.height, base_image.width, base_image.n)
                    if base_image.alpha:
                        img_array = img_array[:, :, :-1]  # drop the alpha channel
                    if img_array.shape[2] == 1:
                        img_array = img_array[:, :, 0]  # plain 2-D grayscale
                    img_array = np.ascontiguousarray(img_array)
                    if img_array.shape[0] > 100 and img_array.shape[1] > 100:
                        processed_img_extracted, _ = enhance_for_persian(img_array, settings, progress)
                        if processed_img_extracted is not None:
                            images.append(processed_img_extracted)
                except Exception as e:
                    logger.warning(f"WARNING Failed to process extracted image for page {page_num + 1}: {str(e)}")
                    continue
        if progress:
            progress(1.0)
        return images
    except Exception as e:
        logger.error(f"FAILURE Page {page_num + 1} processing failed: {str(e)}")
        if progress:
            progress(1.0)
        return []
697
-
698
def optimize_pdf_document(pdf_path: str, settings: Dict[str, Any], progress=None):
    """Write a copy of a PDF whose pages are replaced by preprocessed images.

    Every page is rendered at 2x, run through ``enhance_for_persian``,
    JPEG-encoded and appended to a new document saved as
    ``/app/optimized_<original file name>``.  Pages that cannot be rendered or
    preprocessed are skipped.

    Args:
        pdf_path: path of the source PDF.
        settings: preprocessing overrides; ``None`` means none.
        progress: optional callable receiving a completion fraction.

    Returns:
        The path of the optimized PDF, or ``pdf_path`` unchanged when the
        source is missing, no page could be processed, or an error occurs.
    """
    global pdf_temp_dir, default_preprocessing_settings
    logger.info(f"START Optimizing PDF: {pdf_path}")
    logger.debug(f"PDF Path for optimization: {pdf_path}")
    if not os.path.exists(pdf_path):
        logger.error(f"FAILURE PDF file not found: {pdf_path}")
        return pdf_path

    settings = settings or {}
    doc = None
    new_doc = None
    try:
        output_path = Path(f"/app/optimized_{Path(pdf_path).name}")
        doc = fitz.open(pdf_path)
        new_doc = fitz.open()

        total_pages = len(doc)
        for page_num in tqdm(range(total_pages), desc="📄 Optimizing PDF"):
            page = doc.load_page(page_num)
            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
            if pix.n <= 0 or pix.width <= 0 or pix.height <= 0 or not pix.samples:
                logger.error(f"FAILURE Invalid pixmap data for page {page_num + 1}")
                continue

            img_data = np.frombuffer(pix.samples, dtype=np.uint8)
            expected_size = pix.width * pix.height * pix.n
            if img_data.size != expected_size:
                logger.error(f"FAILURE Pixmap data size mismatch for page {page_num + 1}: expected {expected_size}, got {img_data.size}")
                continue

            img = img_data.reshape(pix.height, pix.width, pix.n)
            logger.debug(f"Image shape from pixmap: {img.shape}")
            processed_img, _ = enhance_for_persian(img, settings, progress)
            if processed_img is None:
                continue

            # Encode in memory instead of going through a scratch file.
            encoded_ok, encoded = cv2.imencode(".jpg", processed_img)
            if not encoded_ok:
                logger.error(f"FAILURE Could not encode page {page_num + 1}")
                continue

            # insert_pdf() only accepts PDF documents, so the image document
            # has to be converted to a one-page PDF first.
            image_doc = fitz.open(stream=encoded.tobytes(), filetype="jpg")
            try:
                page_pdf = fitz.open("pdf", image_doc.convert_to_pdf())
            finally:
                image_doc.close()
            try:
                new_doc.insert_pdf(page_pdf)
            finally:
                page_pdf.close()

        if len(new_doc) == 0:
            # Saving a document without pages fails; keep the original.
            logger.warning("WARNING No page could be optimized, keeping original PDF")
            if progress:
                progress(1.0)
            return pdf_path

        new_doc.save(str(output_path))
        logger.info(f"SUCCESS PDF optimized: {output_path}")
        if progress:
            progress(1.0)
        return str(output_path)
    except Exception as e:
        logger.error(f"FAILURE PDF optimization failed: {str(e)}")
        if progress:
            progress(1.0)
        return pdf_path
    finally:
        # Both document handles were previously never released.
        for handle in (new_doc, doc):
            if handle is not None:
                handle.close()
748
-
749
def cleanup_pdf_temp_dir():
    """Delete the PDF scratch directory; failures are logged, not raised."""
    global pdf_temp_dir
    try:
        shutil.rmtree(pdf_temp_dir)
    except Exception as exc:
        logger.error(f"FAILURE Temp cleanup failed: {str(exc)}")
755
-
756
- # --- Cache functions ---
757
# In-memory result cache: insertion-ordered so the oldest entry can be
# evicted first; access is guarded by cache_lock.
cache_max_size = 1000
cache_data = OrderedDict()
cache_lock = threading.Lock()

# On-disk cache location.
cache_dir_path = Path("/app/cache")
761
-
762
def setup_cache_dir():
    """Create the cache directory if it does not exist yet.

    Missing parent directories are created as well; without ``parents=True``
    the call failed whenever the parent of ``cache_dir_path`` was absent.
    """
    global cache_dir_path
    cache_dir_path.mkdir(parents=True, exist_ok=True)
765
-
766
- def _get_cache_key(image: np.ndarray):
767
- return hashlib.md5(image.tobytes()).hexdigest()
768
-
769
def get_cache(image: np.ndarray):
    """Return the cached result for ``image``, or ``None`` on a miss.

    A hit moves the entry to the most-recently-used end of the cache.
    """
    global cache_data, cache_lock
    key = _get_cache_key(image)
    with cache_lock:
        if key not in cache_data:
            return None
        cache_data.move_to_end(key)
        return pickle.loads(cache_data[key])
778
-
779
def set_cache(image: np.ndarray, result: OCRResult):
    """Store ``result`` for ``image``, evicting the oldest entry when full.

    Overwriting an existing key refreshes its recency and never evicts
    another entry; previously it could drop an unrelated entry even though
    the cache did not grow.
    """
    global cache_data, cache_max_size, cache_lock
    key = _get_cache_key(image)
    with cache_lock:
        if key in cache_data:
            cache_data.move_to_end(key)
        elif cache_data and len(cache_data) >= cache_max_size:
            cache_data.popitem(last=False)
        cache_data[key] = pickle.dumps(result)
786
-
787
# --- PersianOCR functions (Main logic) ---
# Baseline settings; process_single_image() and process_pdf() copy this dict
# and overlay any caller-supplied overrides on the copy.
default_ocr_settings = {
    'resize': True,
    'resize_scale': 200,
    'enhance_contrast': True,
    'reduce_noise': True,
    'sharpen': True,
    'deskew': True,
    'optimize_for_ocr': True,
    'extract_images': True,
    'cache_enabled': True,
    'max_workers': 4,
}
793
-
794
def process_single_image(image: Union[str, np.ndarray], settings: Optional[Dict] = None, progress=None):
    """Run OCR on one image, falling back to a preprocessed copy if needed.

    Steps:
      1. If *image* is a path, load it with OpenCV and convert BGR -> RGB.
      2. Return the cached result when caching is enabled and there is a hit.
      3. OCR the raw image; if that yields text, cache and return it.
      4. Otherwise run ``enhance_for_persian`` and OCR the enhanced image,
         attaching the preprocessing info to the result.

    Args:
        image: File path or an already decoded image array.
        settings: Optional overrides merged over ``default_ocr_settings``.
        progress: Optional progress callback forwarded to the helpers.

    Returns:
        An ``OCRResult``. On any failure a result with empty text is returned
        (model name ``"None"``, or ``"Error"`` when an exception was caught);
        this function never raises.
    """
    start_time = time.time()
    current_settings = default_ocr_settings.copy()
    if settings:
        current_settings.update(settings)
    cache_enabled = current_settings.get('cache_enabled')

    def _blank_result(model_name, elapsed, preprocessing_info):
        # Same positional layout the rest of the file uses for empty results.
        return OCRResult([], [], 0.0, model_name, elapsed, "Unknown", "Unknown", 0, 0, preprocessing_info, 0.0)

    try:
        if isinstance(image, str):
            # Keep the path in its own name: `image` is about to be rebound
            # to the decoded array (or None), and the failure log needs the path.
            image_path = image
            logger.debug(f"Loading image from path: {image_path}")
            image = cv2.imread(image_path)
            if image is None:
                logger.error(f"FAILURE Failed to load image from path: {image_path}")
                return _blank_result("None", 0.0, {})
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            logger.debug(f"Image loaded with shape: {image.shape}")

        if cache_enabled:
            cached_result = get_cache(image)
            if cached_result:
                logger.info("SUCCESS Cache hit")
                return cached_result

        # First attempt: OCR the image as-is.
        result = process_image(image, progress)
        if result and result.text:
            if cache_enabled:
                set_cache(image, result)
            return result

        # Second attempt: enhance the image, then OCR again.
        processed_image, preprocess_info = enhance_for_persian(image, current_settings, progress)
        if processed_image is None:
            return _blank_result("None", time.time() - start_time, preprocess_info)

        result = process_image(processed_image, progress)
        if result:
            result = result._replace(preprocessing_info=preprocess_info)
            if cache_enabled:
                # Keyed on the original image so the next lookup hits.
                set_cache(image, result)
            return result

        return _blank_result("None", time.time() - start_time, preprocess_info)

    except Exception as e:
        logger.error(f"FAILURE Image processing failed: {str(e)}")
        return _blank_result("Error", time.time() - start_time, {})
840
-
841
def process_pdf(pdf_path: str, settings: Optional[Dict] = None, progress=None):
    """OCR every page image of a PDF and return the per-page results.

    The PDF is first passed through ``optimize_pdf_document``, then
    ``process_pdf_document`` supplies the images, which are OCR'd in a
    thread pool via ``process_single_image``.

    Results are collected in submission order, i.e. the order of the images
    returned by ``process_pdf_document`` (presumably page order - confirm
    against that helper). Collecting with ``as_completed`` would shuffle the
    pages whenever a later one finishes first.

    Args:
        pdf_path: Path of the PDF file to process.
        settings: Optional overrides merged over ``default_ocr_settings``.
        progress: Optional progress callback forwarded to the helpers.

    Returns:
        A non-empty list of ``OCRResult``. Only results carrying text are
        kept; when nothing usable was produced the list holds one empty
        placeholder result (model name ``"None"``, or ``"Error"`` when an
        exception was caught). This function never raises.
    """
    current_settings = default_ocr_settings.copy()
    if settings:
        current_settings.update(settings)

    def _placeholder(model_name):
        return [OCRResult([], [], 0.0, model_name, 0.0, "Unknown", "Unknown", 0, 0, {}, 0.0)]

    logger.info(f"START Processing PDF: {pdf_path}")

    if not pdf_path or not os.path.exists(pdf_path):
        logger.error(f"FAILURE PDF file not found or invalid: {pdf_path}")
        return _placeholder("None")

    try:
        optimized_pdf = optimize_pdf_document(pdf_path, current_settings, progress)
        if not optimized_pdf or not os.path.exists(optimized_pdf):
            logger.error(f"FAILURE Optimized PDF not generated: {optimized_pdf}")
            return _placeholder("None")

        images = process_pdf_document(optimized_pdf, current_settings, progress)
        if not images:
            logger.warning("WARNING No images extracted from PDF")
            return _placeholder("None")

        with ThreadPoolExecutor(max_workers=current_settings.get('max_workers')) as executor:
            futures = [
                executor.submit(process_single_image, img, current_settings, progress)
                for img in images
                if img is not None
            ]
            # Walk the futures in the order they were submitted so the
            # output order matches the input order.
            page_results = [future.result() for future in futures]

        results = [res for res in page_results if res and res.text]

        logger.info(f"SUCCESS Processed {len(results)} pages")
        return results if results else _placeholder("None")
    except Exception as e:
        logger.error(f"FAILURE PDF processing failed: {str(e)}")
        return _placeholder("Error")
877
-
878
- # --- رابط کاربری Gradio ---
879
def create_gradio_interface():
    """Build and return the Gradio Blocks UI for the Persian OCR system.

    The UI has a file upload, an accordion of advanced settings, a submit
    button and seven text outputs (text, numbers, confidence, model,
    processing time, image quality, preprocessing info).
    """

    def _format_preprocessing(info) -> str:
        # One "key: value" line per entry; empty string when nothing was recorded.
        if not info:
            return ""
        return "\n".join(f"{k}: {v}" for k, v in info.items())

    def process_file(file, use_cache: bool, preprocessing: bool, confidence: float, scale: int,
                     enhance_contrast: bool, reduce_noise: bool, extract_images: bool,
                     progress=gr.Progress(track_tqdm=True)):
        """Click handler: OCR the uploaded file and return the seven output strings.

        ``progress`` is declared as a default argument because that is how
        Gradio injects a live progress tracker; it is not one of the
        components listed in ``inputs``.
        """
        if file is None:
            logger.error("FAILURE No file provided")
            return ("", "", "0.0", "None", "0.0", "Unknown", "No file uploaded")

        settings = {
            'cache_enabled': use_cache, 'preprocessing_enabled': preprocessing, 'confidence_threshold': confidence,
            'resize': True, 'resize_scale': scale, 'enhance_contrast': enhance_contrast,
            'reduce_noise': reduce_noise, 'extract_images': extract_images, 'sharpen': True, 'deskew': True,
            'optimize_for_ocr': True
        }

        try:
            # Accept either an object exposing `.name` or a plain path string.
            file_path = getattr(file, "name", file)

            if file_path.lower().endswith('.pdf'):
                results = process_pdf(file_path, settings, progress)

                # Each page's lines are joined with spaces, pages with newlines.
                full_text = "\n".join(" ".join(res.text) for res in results)
                numbers_combined = [str(num) for res in results for num in res.numbers]
                confidences = [f"{res.confidence:.2f}" for res in results]
                models_used = [res.model_name for res in results]
                times = [f"{res.processing_time:.2f} seconds" for res in results]
                qualities = [res.image_quality for res in results]
                preprocess_infos = [_format_preprocessing(res.preprocessing_info) for res in results]

                combined_preprocess_info = (
                    "\nPage-wise Preprocessing Info:\n" + "\n\n".join(preprocess_infos)
                    if preprocess_infos else ""
                )

                return (
                    full_text.strip(),
                    ", ".join(numbers_combined),
                    ", ".join(confidences),
                    ", ".join(models_used),
                    ", ".join(times),
                    ", ".join(qualities),
                    combined_preprocess_info
                )

            result = process_single_image(file_path, settings, progress)
            if result and result.text:
                return (
                    "\n".join(result.text),
                    ", ".join(str(num) for num in result.numbers),
                    f"{result.confidence:.2f}",
                    result.model_name,
                    f"{result.processing_time:.2f} seconds",
                    result.image_quality,
                    _format_preprocessing(result.preprocessing_info)
                )
            return ("", "", "0.0", "None", "0.0", "Unknown", "No text extracted")

        except Exception as e:
            # Surface the error text in the last output box instead of crashing the UI.
            logger.error(f"FAILURE Interface error: {str(e)}")
            return ("", "", "0.0", "Error", "0.0", "Unknown", str(e))

    with gr.Blocks(title="سیستم OCR فارسی پیشرفته") as interface:
        gr.Markdown("# سیستم OCR فارسی پیشرفته")
        with gr.Row():
            with gr.Column():
                file_input = gr.File(label="آپلود فایل (تصویر یا PDF)")
                with gr.Accordion("تنظیمات پیشرفته", open=False):
                    use_cache = gr.Checkbox(label="استفاده از کش (Cache)", value=True)
                    preprocessing = gr.Checkbox(label="فعال‌سازی پیش‌پردازش", value=True)
                    confidence = gr.Slider(0.1, 1.0, value=0.7, label="آستانه اطمینان (Confidence Threshold)")
                    scale = gr.Slider(100, 400, value=200, step=50, label="مقیاس تصویر (%)")
                    enhance_contrast = gr.Checkbox(label="بهبود کنتراست", value=True)
                    reduce_noise = gr.Checkbox(label="کاهش نویز", value=True)
                    extract_images = gr.Checkbox(label="استخراج تصاویر از PDF", value=True)
                submit_btn = gr.Button("پردازش متن")
            with gr.Column():
                outputs = [
                    gr.Textbox(label="متن استخراج‌شده", lines=10),
                    gr.Textbox(label="اعداد استخراج‌شده", lines=2),
                    gr.Textbox(label="میزان اطمینان (Confidence)"),
                    gr.Textbox(label="مدل OCR استفاده‌شده"),
                    gr.Textbox(label="زمان پردازش"),
                    gr.Textbox(label="کیفیت تصویر"),
                    gr.Textbox(label="اطلاعات پیش‌پردازش", lines=5)
                ]

        submit_btn.click(
            fn=process_file,
            inputs=[file_input, use_cache, preprocessing, confidence, scale, enhance_contrast, reduce_noise, extract_images],
            outputs=outputs
        )
    return interface
977
-
978
- # --- تابع اصلی ---
979
def main():
    """Prepare the runtime environment and launch the Gradio app.

    Installs system dependencies, creates the log and cache directories,
    logs whether a GPU is available, then serves the interface on
    0.0.0.0:7860. Any startup failure is logged and re-raised.
    """
    try:
        logger.info("START Initializing system")
        setup_system_dependencies()  # install system-level dependencies

        for runtime_dir in ('/app/logs', '/app/cache'):
            os.makedirs(runtime_dir, exist_ok=True)
        setup_cache_dir()

        if torch.cuda.is_available():
            device = "GPU"
        else:
            device = "CPU"
        logger.info(f"SUCCESS Using {device}")

        # Run Gradio for Hugging Face Spaces
        app = create_gradio_interface()
        app.launch(server_name="0.0.0.0", server_port=7860, share=False)
    except Exception as startup_error:
        logger.error(f"FAILURE Main error: {startup_error!s}")
        raise


if __name__ == "__main__":
    main()