Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -158,7 +158,7 @@ class OCRSystem:
|
|
158 |
"""Initialize OCR models configuration based on available hardware"""
|
159 |
self.models = {}
|
160 |
self.model_performance = {
|
161 |
-
'microsoft_trocr': {'success': 0, 'fail': 0},
|
162 |
'pretrained_model': {'success': 0, 'fail': 0},
|
163 |
'mT5_OCR_fa': {'success': 0, 'fail': 0},
|
164 |
'LayoutLMv3_fa': {'success': 0, 'fail': 0},
|
@@ -172,7 +172,7 @@ class OCRSystem:
|
|
172 |
self.logger.info(f"Using device: {self.device}, Max workers: {self.max_workers}")
|
173 |
|
174 |
self.model_configs = {
|
175 |
-
'microsoft_trocr': {
|
176 |
'name': "microsoft/trocr-base-printed",
|
177 |
'type': "transformer",
|
178 |
'threshold': 0.85,
|
@@ -223,7 +223,7 @@ class OCRSystem:
|
|
223 |
self.logger.warning(f"WARNING: Cannot create {tessdata_dir}: {str(e)}. Tesseract may fail without language data.")
|
224 |
return False
|
225 |
|
226 |
-
base_url = "https://github.com/tesseract-ocr/
|
227 |
|
228 |
for lang in languages:
|
229 |
file_path = os.path.join(tessdata_dir, f"{lang}.traineddata")
|
@@ -269,13 +269,15 @@ class OCRSystem:
|
|
269 |
return False
|
270 |
|
271 |
# Download Tesseract language data
|
272 |
-
self.download_tessdata(['eng', 'fas'])
|
273 |
-
tessdata_prefix = TESSDATA_LOCAL
|
274 |
os.environ['TESSDATA_PREFIX'] = tessdata_prefix
|
275 |
pytesseract.pytesseract.tesseract_cmd = TESSERACT_CMD
|
276 |
|
277 |
version = subprocess.check_output([TESSERACT_CMD, '--version'])
|
278 |
self.logger.info(f"SUCCESS: Tesseract installed: {version.decode()[:50]}")
|
|
|
|
|
279 |
self.logger.info("SUCCESS: System dependencies setup completed")
|
280 |
return True
|
281 |
|
|
|
158 |
"""Initialize OCR models configuration based on available hardware"""
|
159 |
self.models = {}
|
160 |
self.model_performance = {
|
161 |
+
'microsoft_trocr': {'success': 0, 'fail': 0},
|
162 |
'pretrained_model': {'success': 0, 'fail': 0},
|
163 |
'mT5_OCR_fa': {'success': 0, 'fail': 0},
|
164 |
'LayoutLMv3_fa': {'success': 0, 'fail': 0},
|
|
|
172 |
self.logger.info(f"Using device: {self.device}, Max workers: {self.max_workers}")
|
173 |
|
174 |
self.model_configs = {
|
175 |
+
'microsoft_trocr': {
|
176 |
'name': "microsoft/trocr-base-printed",
|
177 |
'type': "transformer",
|
178 |
'threshold': 0.85,
|
|
|
223 |
self.logger.warning(f"WARNING: Cannot create {tessdata_dir}: {str(e)}. Tesseract may fail without language data.")
|
224 |
return False
|
225 |
|
226 |
+
base_url = "https://github.com/tesseract-ocr/tessdata_best/raw/main/" # Updated URL
|
227 |
|
228 |
for lang in languages:
|
229 |
file_path = os.path.join(tessdata_dir, f"{lang}.traineddata")
|
|
|
269 |
return False
|
270 |
|
271 |
# Download Tesseract language data
|
272 |
+
tessdata_downloaded = self.download_tessdata(['eng', 'fas'])
|
273 |
+
tessdata_prefix = TESSDATA_LOCAL if tessdata_downloaded else TESSDATA_PREFIX_DEFAULT
|
274 |
os.environ['TESSDATA_PREFIX'] = tessdata_prefix
|
275 |
pytesseract.pytesseract.tesseract_cmd = TESSERACT_CMD
|
276 |
|
277 |
version = subprocess.check_output([TESSERACT_CMD, '--version'])
|
278 |
self.logger.info(f"SUCCESS: Tesseract installed: {version.decode()[:50]}")
|
279 |
+
if not os.path.exists(os.path.join(tessdata_prefix, 'eng.traineddata')):
|
280 |
+
self.logger.warning(f"WARNING: Tesseract language data not found in {tessdata_prefix}, functionality may be limited")
|
281 |
self.logger.info("SUCCESS: System dependencies setup completed")
|
282 |
return True
|
283 |
|