Really-amin committed on
Commit
ec92b9a
·
verified ·
1 Parent(s): 5b3b0b2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -5
app.py CHANGED
@@ -158,7 +158,7 @@ class OCRSystem:
158
  """Initialize OCR models configuration based on available hardware"""
159
  self.models = {}
160
  self.model_performance = {
161
- 'microsoft_trocr': {'success': 0, 'fail': 0}, # Added Microsoft TrOCR
162
  'pretrained_model': {'success': 0, 'fail': 0},
163
  'mT5_OCR_fa': {'success': 0, 'fail': 0},
164
  'LayoutLMv3_fa': {'success': 0, 'fail': 0},
@@ -172,7 +172,7 @@ class OCRSystem:
172
  self.logger.info(f"Using device: {self.device}, Max workers: {self.max_workers}")
173
 
174
  self.model_configs = {
175
- 'microsoft_trocr': { # Microsoft TrOCR for typed documents
176
  'name': "microsoft/trocr-base-printed",
177
  'type': "transformer",
178
  'threshold': 0.85,
@@ -223,7 +223,7 @@ class OCRSystem:
223
  self.logger.warning(f"WARNING: Cannot create {tessdata_dir}: {str(e)}. Tesseract may fail without language data.")
224
  return False
225
 
226
- base_url = "https://github.com/tesseract-ocr/tessdata/raw/master/"
227
 
228
  for lang in languages:
229
  file_path = os.path.join(tessdata_dir, f"{lang}.traineddata")
@@ -269,13 +269,15 @@ class OCRSystem:
269
  return False
270
 
271
  # Download Tesseract language data
272
- self.download_tessdata(['eng', 'fas'])
273
- tessdata_prefix = TESSDATA_LOCAL
274
  os.environ['TESSDATA_PREFIX'] = tessdata_prefix
275
  pytesseract.pytesseract.tesseract_cmd = TESSERACT_CMD
276
 
277
  version = subprocess.check_output([TESSERACT_CMD, '--version'])
278
  self.logger.info(f"SUCCESS: Tesseract installed: {version.decode()[:50]}")
 
 
279
  self.logger.info("SUCCESS: System dependencies setup completed")
280
  return True
281
 
 
158
  """Initialize OCR models configuration based on available hardware"""
159
  self.models = {}
160
  self.model_performance = {
161
+ 'microsoft_trocr': {'success': 0, 'fail': 0},
162
  'pretrained_model': {'success': 0, 'fail': 0},
163
  'mT5_OCR_fa': {'success': 0, 'fail': 0},
164
  'LayoutLMv3_fa': {'success': 0, 'fail': 0},
 
172
  self.logger.info(f"Using device: {self.device}, Max workers: {self.max_workers}")
173
 
174
  self.model_configs = {
175
+ 'microsoft_trocr': {
176
  'name': "microsoft/trocr-base-printed",
177
  'type': "transformer",
178
  'threshold': 0.85,
 
223
  self.logger.warning(f"WARNING: Cannot create {tessdata_dir}: {str(e)}. Tesseract may fail without language data.")
224
  return False
225
 
226
+ base_url = "https://github.com/tesseract-ocr/tessdata_best/raw/main/" # Updated URL
227
 
228
  for lang in languages:
229
  file_path = os.path.join(tessdata_dir, f"{lang}.traineddata")
 
269
  return False
270
 
271
  # Download Tesseract language data
272
+ tessdata_downloaded = self.download_tessdata(['eng', 'fas'])
273
+ tessdata_prefix = TESSDATA_LOCAL if tessdata_downloaded else TESSDATA_PREFIX_DEFAULT
274
  os.environ['TESSDATA_PREFIX'] = tessdata_prefix
275
  pytesseract.pytesseract.tesseract_cmd = TESSERACT_CMD
276
 
277
  version = subprocess.check_output([TESSERACT_CMD, '--version'])
278
  self.logger.info(f"SUCCESS: Tesseract installed: {version.decode()[:50]}")
279
+ if not os.path.exists(os.path.join(tessdata_prefix, 'eng.traineddata')):
280
+ self.logger.warning(f"WARNING: Tesseract language data not found in {tessdata_prefix}, functionality may be limited")
281
  self.logger.info("SUCCESS: System dependencies setup completed")
282
  return True
283