Spaces:
Running
Running
| from typing import List, Optional | |
| import numpy as np | |
| import pytesseract | |
| from pytesseract import Output | |
| from tqdm import tqdm | |
| from surya.input.processing import slice_bboxes_from_image | |
| from surya.settings import settings | |
| import os | |
| from concurrent.futures import ProcessPoolExecutor | |
| from surya.detection import get_batch_size as get_det_batch_size | |
| from surya.recognition import get_batch_size as get_rec_batch_size | |
| from surya.languages import CODE_TO_LANGUAGE | |
def surya_lang_to_tesseract(code: str) -> Optional[str]:
    """Translate a surya language code into the matching tesseract code.

    The surya code is first resolved to a language name through
    ``CODE_TO_LANGUAGE``; that name is then looked up in
    ``TESS_LANGUAGE_TO_CODE``.

    Args:
        code: Key into ``CODE_TO_LANGUAGE``.

    Returns:
        The tesseract language code, or ``None`` when the language name has
        no entry in ``TESS_LANGUAGE_TO_CODE``.

    Raises:
        KeyError: If ``code`` is not a key of ``CODE_TO_LANGUAGE``.
    """
    language_name = CODE_TO_LANGUAGE[code]
    # dict.get yields None for language names without a tesseract code.
    return TESS_LANGUAGE_TO_CODE.get(language_name)
def tesseract_ocr(img, bboxes, lang: str):
    """Run tesseract on every bounding-box crop of a single image.

    Args:
        img: Image handed to ``slice_bboxes_from_image``.
        bboxes: Bounding boxes used to cut ``img`` into crops.
        lang: Value forwarded to tesseract as its ``lang`` argument.

    Returns:
        A list holding the string tesseract returned for each crop, in the
        order the crops were produced.
    """
    crops = slice_bboxes_from_image(img, bboxes)
    # Point tesseract at the tessdata directory configured in settings.
    tess_config = f'--tessdata-dir "{settings.TESSDATA_PREFIX}"'
    return [
        pytesseract.image_to_string(crop, lang=lang, config=tess_config)
        for crop in crops
    ]
def tesseract_ocr_parallel(imgs, bboxes, langs: List[str], cpus=None):
    """Run ``tesseract_ocr`` over many images using a process pool.

    Each worker call receives one image together with the matching entry of
    ``bboxes`` and ``langs``.

    Args:
        imgs: Images to process.
        bboxes: Per-image bounding boxes, aligned with ``imgs``.
        langs: Per-image tesseract language, aligned with ``imgs``.
        cpus: Upper bound on the cores to use. When falsy, the value of
            ``os.cpu_count()`` is used instead.

    Returns:
        A list with one ``tesseract_ocr`` result per image, in input order.
    """
    tess_parallel_cores = min(len(imgs), get_rec_batch_size())
    if not cpus:
        # os.cpu_count() returns None when the count cannot be determined;
        # fall back to one core so min() below never compares int with None.
        cpus = os.cpu_count() or 1
    tess_parallel_cores = min(tess_parallel_cores, cpus)

    # Tesseract uses up to 4 processes per instance
    # Divide by 2 because tesseract doesn't seem to saturate all 4 cores with these small images
    tess_parallel = max(tess_parallel_cores // 2, 1)

    with ProcessPoolExecutor(max_workers=tess_parallel) as executor:
        tess_text = tqdm(
            executor.map(tesseract_ocr, imgs, bboxes, langs),
            total=len(imgs),
            desc="Running tesseract OCR",
        )
        # Drain the iterator while the pool is still open.
        tess_text = list(tess_text)
    return tess_text
def tesseract_bboxes(img):
    """Detect boxes in an image with tesseract.

    The image is converted to a ``uint8`` array and passed to
    ``pytesseract.image_to_data``; every entry of the result yields one box.

    Args:
        img: Image to analyse; it must be convertible with ``np.asarray``.

    Returns:
        A list of ``(x1, y1, x2, y2)`` tuples computed as
        ``(left, top, left + width, top + height)``.
    """
    pixels = np.asarray(img, dtype=np.uint8)
    data = pytesseract.image_to_data(pixels, output_type=Output.DICT)
    # It is possible to merge by line here with line number, but it gives bad results.
    # The 'level' column bounds the number of entries, one box per entry.
    rows = zip(data['level'], data['left'], data['top'], data['width'], data['height'])
    return [(x, y, x + w, y + h) for _, x, y, w, h in rows]
def tesseract_parallel(imgs):
    """Run ``tesseract_bboxes`` over many images using a process pool.

    Args:
        imgs: Images to process.

    Returns:
        A list with one ``tesseract_bboxes`` result per image, in input order.
    """
    tess_parallel_cores = min(len(imgs), get_det_batch_size())
    # os.cpu_count() returns None when the count cannot be determined;
    # fall back to one core so min() below never compares int with None.
    cpus = os.cpu_count() or 1
    tess_parallel_cores = min(tess_parallel_cores, cpus)

    # Tesseract uses 4 threads per instance
    tess_parallel = max(tess_parallel_cores // 4, 1)

    with ProcessPoolExecutor(max_workers=tess_parallel) as executor:
        tess_bboxes = tqdm(
            executor.map(tesseract_bboxes, imgs),
            total=len(imgs),
            desc="Running tesseract bbox detection",
        )
        # Drain the iterator while the pool is still open.
        tess_bboxes = list(tess_bboxes)
    return tess_bboxes
# Tesseract language code -> language name.
TESS_CODE_TO_LANGUAGE = {
    "afr": "Afrikaans",
    "amh": "Amharic",
    "ara": "Arabic",
    "asm": "Assamese",
    "aze": "Azerbaijani",
    "bel": "Belarusian",
    "ben": "Bengali",
    "bod": "Tibetan",
    "bos": "Bosnian",
    "bre": "Breton",
    "bul": "Bulgarian",
    "cat": "Catalan",
    "ceb": "Cebuano",
    "ces": "Czech",
    "chi_sim": "Chinese",
    "chr": "Cherokee",
    "cym": "Welsh",
    "dan": "Danish",
    "deu": "German",
    "dzo": "Dzongkha",
    "ell": "Greek",
    "eng": "English",
    "epo": "Esperanto",
    "est": "Estonian",
    "eus": "Basque",
    "fas": "Persian",
    "fin": "Finnish",
    "fra": "French",
    "fry": "Western Frisian",
    "guj": "Gujarati",
    "gla": "Scottish Gaelic",
    "gle": "Irish",
    "glg": "Galician",
    "heb": "Hebrew",
    "hin": "Hindi",
    "hrv": "Croatian",
    "hun": "Hungarian",
    "hye": "Armenian",
    "iku": "Inuktitut",
    "ind": "Indonesian",
    "isl": "Icelandic",
    "ita": "Italian",
    "jav": "Javanese",
    "jpn": "Japanese",
    "kan": "Kannada",
    "kat": "Georgian",
    "kaz": "Kazakh",
    "khm": "Khmer",
    "kir": "Kyrgyz",
    "kor": "Korean",
    "lao": "Lao",
    "lat": "Latin",
    "lav": "Latvian",
    "lit": "Lithuanian",
    "mal": "Malayalam",
    "mar": "Marathi",
    "mkd": "Macedonian",
    "mlt": "Maltese",
    "mon": "Mongolian",
    "msa": "Malay",
    "mya": "Burmese",
    "nep": "Nepali",
    "nld": "Dutch",
    "nor": "Norwegian",
    "ori": "Oriya",
    "pan": "Punjabi",
    "pol": "Polish",
    "por": "Portuguese",
    "pus": "Pashto",
    "ron": "Romanian",
    "rus": "Russian",
    "san": "Sanskrit",
    "sin": "Sinhala",
    "slk": "Slovak",
    "slv": "Slovenian",
    "snd": "Sindhi",
    "spa": "Spanish",
    "sqi": "Albanian",
    "srp": "Serbian",
    "swa": "Swahili",
    "swe": "Swedish",
    "syr": "Syriac",
    "tam": "Tamil",
    "tel": "Telugu",
    "tgk": "Tajik",
    "tha": "Thai",
    "tir": "Tigrinya",
    "tur": "Turkish",
    "uig": "Uyghur",
    "ukr": "Ukrainian",
    "urd": "Urdu",
    "uzb": "Uzbek",
    "vie": "Vietnamese",
    "yid": "Yiddish",
}

# Inverse lookup: language name -> tesseract language code.
TESS_LANGUAGE_TO_CODE = {
    language: tess_code for tess_code, language in TESS_CODE_TO_LANGUAGE.items()
}