Spaces:

brestok
/

ocr-backend

Sleeping

App Files Community

brestok committed on Mar 2

Commit

150c3f8

1 Parent(s): 66dc64c

add ner

Browse files

Files changed (4) hide show

ocr/api/message/utils.py +16 -0
ocr/api/message/views.py +2 -6
ocr/core/config.py +2 -0
requirements.txt +65 -0

ocr/api/message/utils.py CHANGED Viewed

@@ -4,8 +4,11 @@ import re
 import pytesseract
 from PIL import Image
 from pdf2image import convert_from_bytes
 def divide_images(contents: bytes) -> list[bytes]:
     images = convert_from_bytes(contents, dpi=250)
@@ -49,3 +52,16 @@ def clean_response(text: str) -> str:
     except Exception as e:
         pass
     return text

 import pytesseract
 from PIL import Image
+from flair.data import Sentence
 from pdf2image import convert_from_bytes
+from ocr.core.config import settings
 def divide_images(contents: bytes) -> list[bytes]:
     images = convert_from_bytes(contents, dpi=250)
     except Exception as e:
         pass
     return text
+def clean_text(text: str) -> str:
+    """Return ``text`` with every NER span tagged ``'PER'`` cut out.
+
+    The text is wrapped in a flair ``Sentence`` and passed to
+    ``settings.TAGGER.predict``; each ``'ner'`` span whose tag is ``'PER'``
+    is then removed from the string by its start/end position. Only the
+    span itself is sliced out, so whitespace and punctuation around it are
+    left in place.
+
+    NOTE(review): ``'PER'`` is presumably the tagger's label for person
+    names -- confirm against the model card of the configured tagger.
+    """
+    sentence = Sentence(text)
+    # The predicted spans are read back from ``sentence`` below; the return
+    # value of predict() is not used.
+    settings.TAGGER.predict(sentence)
+    per_entities = [entity for entity in sentence.get_spans('ner') if entity.tag == 'PER']
+    # Handle spans from the end of the text backwards so that cutting one
+    # span does not shift the positions of the spans still to be processed.
+    per_entities = sorted(per_entities, key=lambda x: x.start_position, reverse=True)
+    cleaned_text = text
+    for entity in per_entities:
+        # NOTE(review): assumes start_position/end_position are character
+        # offsets into ``text`` -- they are used directly as slice indices.
+        start = entity.start_position
+        end = entity.end_position
+        cleaned_text = cleaned_text[:start] + cleaned_text[end:]
+    return cleaned_text

ocr/api/message/views.py CHANGED Viewed

@@ -2,7 +2,7 @@ from fastapi import File, UploadFile, HTTPException
 from ocr.api.message import ocr_router
 from ocr.api.message.schemas import OcrResponse
-from ocr.api.message.utils import divide_images, clean_response, extract_text_from_images
 from ocr.core.wrappers import OcrResponseWrapper
@@ -21,11 +21,7 @@ async def get_all_chat_messages(
         else:
             raise HTTPException(status_code=400, detail='Unsupported file type.')
         text_content = extract_text_from_images(images)
-        # original_text, response = await asyncio.gather(
-        #     extract_original_text(text_content),
-            # generate_report(text_content)
-        # )
-        cleaned_original_text = text_content
         return OcrResponseWrapper(data=OcrResponse(text=clean_response(text_content), originalText=cleaned_original_text))
     finally:
         await file.close()

 from ocr.api.message import ocr_router
 from ocr.api.message.schemas import OcrResponse
+from ocr.api.message.utils import divide_images, clean_response, extract_text_from_images, clean_text
 from ocr.core.wrappers import OcrResponseWrapper
         else:
             raise HTTPException(status_code=400, detail='Unsupported file type.')
         text_content = extract_text_from_images(images)
+        cleaned_original_text = clean_text(text_content)
         return OcrResponseWrapper(data=OcrResponse(text=clean_response(text_content), originalText=cleaned_original_text))
     finally:
         await file.close()

ocr/core/config.py CHANGED Viewed

@@ -3,12 +3,14 @@ import pathlib
 from functools import lru_cache
 from dotenv import load_dotenv
 load_dotenv()
 class BaseConfig:
     BASE_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent
     SECRET_KEY = os.getenv('SECRET')
 class DevelopmentConfig(BaseConfig):
     Issuer = "http://localhost:8000"

 from functools import lru_cache
 from dotenv import load_dotenv
+from flair.models import SequenceTagger
 load_dotenv()
 class BaseConfig:
     BASE_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent
     SECRET_KEY = os.getenv('SECRET')
+    TAGGER = SequenceTagger.load("flair/ner-english-large")
 class DevelopmentConfig(BaseConfig):
     Issuer = "http://localhost:8000"

requirements.txt CHANGED Viewed

@@ -1,31 +1,96 @@
 annotated-types==0.7.0
 anyio==4.8.0
 certifi==2024.12.14
 click==8.1.8
 distro==1.9.0
 fastapi==0.115.6
 h11==0.14.0
 httpcore==1.0.7
 httptools==0.6.4
 httpx==0.28.1
 idna==3.10
 jiter==0.8.2
 openai==1.59.9
 packaging==24.2
 pdf2image==1.17.0
 pillow==11.1.0
 pydantic==2.10.5
 pydantic_core==2.27.2
 pydash==8.0.5
 pytesseract==0.3.13
 python-dotenv==1.0.1
 python-multipart==0.0.20
 PyYAML==6.0.2
 sniffio==1.3.1
 starlette==0.41.3
 tqdm==4.67.1
 typing_extensions==4.12.2
 uvicorn==0.34.0
 uvloop==0.21.0
 watchfiles==1.0.4
 websockets==14.2

+accelerate==1.4.0
 annotated-types==0.7.0
 anyio==4.8.0
+attrs==25.1.0
+beautifulsoup4==4.13.3
+bioc==2.1
+boto3==1.37.4
+botocore==1.37.4
 certifi==2024.12.14
+charset-normalizer==3.4.1
 click==8.1.8
+conllu==4.5.3
+contourpy==1.3.1
+cycler==0.12.1
+Deprecated==1.2.18
 distro==1.9.0
+docopt==0.6.2
 fastapi==0.115.6
+filelock==3.17.0
+flair==0.15.1
+fonttools==4.56.0
+fsspec==2025.2.0
+ftfy==6.3.1
+gdown==5.2.0
 h11==0.14.0
 httpcore==1.0.7
 httptools==0.6.4
 httpx==0.28.1
+huggingface-hub==0.29.1
 idna==3.10
+intervaltree==3.1.0
+Jinja2==3.1.5
 jiter==0.8.2
+jmespath==1.0.1
+joblib==1.4.2
+jsonlines==4.0.0
+kiwisolver==1.4.8
+langdetect==1.0.9
+lxml==5.3.1
+MarkupSafe==3.0.2
+matplotlib==3.10.1
+more-itertools==10.6.0
+mpld3==0.5.10
+mpmath==1.3.0
+networkx==3.4.2
+numpy==1.26.4
 openai==1.59.9
 packaging==24.2
 pdf2image==1.17.0
 pillow==11.1.0
+pptree==3.1
+protobuf==5.29.3
+psutil==7.0.0
 pydantic==2.10.5
 pydantic_core==2.27.2
 pydash==8.0.5
+pyparsing==3.2.1
+PySocks==1.7.1
 pytesseract==0.3.13
+python-dateutil==2.9.0.post0
 python-dotenv==1.0.1
 python-multipart==0.0.20
+pytorch_revgrad==0.2.0
 PyYAML==6.0.2
+regex==2024.11.6
+requests==2.32.3
+s3transfer==0.11.3
+safetensors==0.5.3
+scikit-learn==1.6.1
+scipy==1.15.2
+segtok==1.5.11
+sentencepiece==0.2.0
+setuptools==75.8.2
+six==1.17.0
 sniffio==1.3.1
+sortedcontainers==2.4.0
+soupsieve==2.6
+sqlitedict==2.1.0
 starlette==0.41.3
+sympy==1.13.1
+tabulate==0.9.0
+threadpoolctl==3.5.0
+tokenizers==0.21.0
+torch==2.6.0
 tqdm==4.67.1
+transformer-smaller-training-vocab==0.4.0
+transformers==4.49.0
 typing_extensions==4.12.2
+urllib3==2.3.0
 uvicorn==0.34.0
 uvloop==0.21.0
 watchfiles==1.0.4
+wcwidth==0.2.13
 websockets==14.2
+Wikipedia-API==0.8.1
+wrapt==1.17.2