Spaces:
Sleeping
Sleeping
add ner
Browse files- ocr/api/message/utils.py +16 -0
- ocr/api/message/views.py +2 -6
- ocr/core/config.py +2 -0
- requirements.txt +65 -0
ocr/api/message/utils.py
CHANGED
|
@@ -4,8 +4,11 @@ import re
|
|
| 4 |
|
| 5 |
import pytesseract
|
| 6 |
from PIL import Image
|
|
|
|
| 7 |
from pdf2image import convert_from_bytes
|
| 8 |
|
|
|
|
|
|
|
| 9 |
|
| 10 |
def divide_images(contents: bytes) -> list[bytes]:
|
| 11 |
images = convert_from_bytes(contents, dpi=250)
|
|
@@ -49,3 +52,16 @@ def clean_response(text: str) -> str:
|
|
| 49 |
except Exception as e:
|
| 50 |
pass
|
| 51 |
return text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
import pytesseract
|
| 6 |
from PIL import Image
|
| 7 |
+
from flair.data import Sentence
|
| 8 |
from pdf2image import convert_from_bytes
|
| 9 |
|
| 10 |
+
from ocr.core.config import settings
|
| 11 |
+
|
| 12 |
|
| 13 |
def divide_images(contents: bytes) -> list[bytes]:
|
| 14 |
images = convert_from_bytes(contents, dpi=250)
|
|
|
|
| 52 |
except Exception as e:
|
| 53 |
pass
|
| 54 |
return text
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def clean_text(text: str) -> str:
    """Return *text* with every span tagged ``PER`` by the NER tagger removed.

    The text is wrapped in a flair ``Sentence``, tagged with
    ``settings.TAGGER``, and the character ranges of all ``'ner'`` spans whose
    tag is ``'PER'`` are cut out. Everything else, including the whitespace
    surrounding a removed span, is left untouched.

    Args:
        text: Raw text to scrub.

    Returns:
        The text without the detected ``PER`` spans. Blank input is returned
        unchanged without running the tagger.
    """
    # Nothing to tag: skip model inference for empty / whitespace-only input.
    if not text.strip():
        return text

    sentence = Sentence(text)
    settings.TAGGER.predict(sentence)

    # assumes start_position/end_position are character offsets into `text`
    # (the slicing below depends on it) -- TODO confirm against flair docs.
    person_spans = sorted(
        (span for span in sentence.get_spans('ner') if span.tag == 'PER'),
        key=lambda span: span.start_position,
    )

    # Single left-to-right pass: keep the gaps between spans and join once,
    # instead of re-slicing the whole string for every entity.
    kept = []
    cursor = 0
    for span in person_spans:
        if span.start_position > cursor:
            kept.append(text[cursor:span.start_position])
        # max() keeps the cursor monotonic should two spans ever overlap.
        cursor = max(cursor, span.end_position)
    kept.append(text[cursor:])
    return "".join(kept)
|
ocr/api/message/views.py
CHANGED
|
@@ -2,7 +2,7 @@ from fastapi import File, UploadFile, HTTPException
|
|
| 2 |
|
| 3 |
from ocr.api.message import ocr_router
|
| 4 |
from ocr.api.message.schemas import OcrResponse
|
| 5 |
-
from ocr.api.message.utils import divide_images, clean_response, extract_text_from_images
|
| 6 |
from ocr.core.wrappers import OcrResponseWrapper
|
| 7 |
|
| 8 |
|
|
@@ -21,11 +21,7 @@ async def get_all_chat_messages(
|
|
| 21 |
else:
|
| 22 |
raise HTTPException(status_code=400, detail='Unsupported file type.')
|
| 23 |
text_content = extract_text_from_images(images)
|
| 24 |
-
|
| 25 |
-
# extract_original_text(text_content),
|
| 26 |
-
# generate_report(text_content)
|
| 27 |
-
# )
|
| 28 |
-
cleaned_original_text = text_content
|
| 29 |
return OcrResponseWrapper(data=OcrResponse(text=clean_response(text_content), originalText=cleaned_original_text))
|
| 30 |
finally:
|
| 31 |
await file.close()
|
|
|
|
| 2 |
|
| 3 |
from ocr.api.message import ocr_router
|
| 4 |
from ocr.api.message.schemas import OcrResponse
|
| 5 |
+
from ocr.api.message.utils import divide_images, clean_response, extract_text_from_images, clean_text
|
| 6 |
from ocr.core.wrappers import OcrResponseWrapper
|
| 7 |
|
| 8 |
|
|
|
|
| 21 |
else:
|
| 22 |
raise HTTPException(status_code=400, detail='Unsupported file type.')
|
| 23 |
text_content = extract_text_from_images(images)
|
| 24 |
+
cleaned_original_text = clean_text(text_content)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
return OcrResponseWrapper(data=OcrResponse(text=clean_response(text_content), originalText=cleaned_original_text))
|
| 26 |
finally:
|
| 27 |
await file.close()
|
ocr/core/config.py
CHANGED
|
@@ -3,12 +3,14 @@ import pathlib
|
|
| 3 |
from functools import lru_cache
|
| 4 |
|
| 5 |
from dotenv import load_dotenv
|
|
|
|
| 6 |
|
| 7 |
load_dotenv()
|
| 8 |
|
| 9 |
class BaseConfig:
|
| 10 |
BASE_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent
|
| 11 |
SECRET_KEY = os.getenv('SECRET')
|
|
|
|
| 12 |
|
| 13 |
class DevelopmentConfig(BaseConfig):
|
| 14 |
Issuer = "http://localhost:8000"
|
|
|
|
| 3 |
from functools import lru_cache
|
| 4 |
|
| 5 |
from dotenv import load_dotenv
|
| 6 |
+
from flair.models import SequenceTagger
|
| 7 |
|
| 8 |
load_dotenv()
|
| 9 |
|
| 10 |
# Base settings; DevelopmentConfig below subclasses this.
class BaseConfig:
|
| 11 |
# Directory three levels above this file.
BASE_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent
|
| 12 |
# Read from the SECRET environment variable; None when it is unset.
SECRET_KEY = os.getenv('SECRET')
|
| 13 |
+
# NOTE(review): this call sits in the class body, so the flair NER model is
# loaded when the class is defined (i.e. on import of this module) -- confirm
# that the startup cost is acceptable.
TAGGER = SequenceTagger.load("flair/ner-english-large")
|
| 14 |
|
| 15 |
class DevelopmentConfig(BaseConfig):
|
| 16 |
Issuer = "http://localhost:8000"
|
requirements.txt
CHANGED
|
@@ -1,31 +1,96 @@
|
|
|
|
|
| 1 |
annotated-types==0.7.0
|
| 2 |
anyio==4.8.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
certifi==2024.12.14
|
|
|
|
| 4 |
click==8.1.8
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
distro==1.9.0
|
|
|
|
| 6 |
fastapi==0.115.6
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
h11==0.14.0
|
| 8 |
httpcore==1.0.7
|
| 9 |
httptools==0.6.4
|
| 10 |
httpx==0.28.1
|
|
|
|
| 11 |
idna==3.10
|
|
|
|
|
|
|
| 12 |
jiter==0.8.2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
openai==1.59.9
|
| 14 |
packaging==24.2
|
| 15 |
pdf2image==1.17.0
|
| 16 |
pillow==11.1.0
|
|
|
|
|
|
|
|
|
|
| 17 |
pydantic==2.10.5
|
| 18 |
pydantic_core==2.27.2
|
| 19 |
pydash==8.0.5
|
|
|
|
|
|
|
| 20 |
pytesseract==0.3.13
|
|
|
|
| 21 |
python-dotenv==1.0.1
|
| 22 |
python-multipart==0.0.20
|
|
|
|
| 23 |
PyYAML==6.0.2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
sniffio==1.3.1
|
|
|
|
|
|
|
|
|
|
| 25 |
starlette==0.41.3
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
tqdm==4.67.1
|
|
|
|
|
|
|
| 27 |
typing_extensions==4.12.2
|
|
|
|
| 28 |
uvicorn==0.34.0
|
| 29 |
uvloop==0.21.0
|
| 30 |
watchfiles==1.0.4
|
|
|
|
| 31 |
websockets==14.2
|
|
|
|
|
|
|
|
|
| 1 |
+
accelerate==1.4.0
|
| 2 |
annotated-types==0.7.0
|
| 3 |
anyio==4.8.0
|
| 4 |
+
attrs==25.1.0
|
| 5 |
+
beautifulsoup4==4.13.3
|
| 6 |
+
bioc==2.1
|
| 7 |
+
boto3==1.37.4
|
| 8 |
+
botocore==1.37.4
|
| 9 |
certifi==2024.12.14
|
| 10 |
+
charset-normalizer==3.4.1
|
| 11 |
click==8.1.8
|
| 12 |
+
conllu==4.5.3
|
| 13 |
+
contourpy==1.3.1
|
| 14 |
+
cycler==0.12.1
|
| 15 |
+
Deprecated==1.2.18
|
| 16 |
distro==1.9.0
|
| 17 |
+
docopt==0.6.2
|
| 18 |
fastapi==0.115.6
|
| 19 |
+
filelock==3.17.0
|
| 20 |
+
flair==0.15.1
|
| 21 |
+
fonttools==4.56.0
|
| 22 |
+
fsspec==2025.2.0
|
| 23 |
+
ftfy==6.3.1
|
| 24 |
+
gdown==5.2.0
|
| 25 |
h11==0.14.0
|
| 26 |
httpcore==1.0.7
|
| 27 |
httptools==0.6.4
|
| 28 |
httpx==0.28.1
|
| 29 |
+
huggingface-hub==0.29.1
|
| 30 |
idna==3.10
|
| 31 |
+
intervaltree==3.1.0
|
| 32 |
+
Jinja2==3.1.5
|
| 33 |
jiter==0.8.2
|
| 34 |
+
jmespath==1.0.1
|
| 35 |
+
joblib==1.4.2
|
| 36 |
+
jsonlines==4.0.0
|
| 37 |
+
kiwisolver==1.4.8
|
| 38 |
+
langdetect==1.0.9
|
| 39 |
+
lxml==5.3.1
|
| 40 |
+
MarkupSafe==3.0.2
|
| 41 |
+
matplotlib==3.10.1
|
| 42 |
+
more-itertools==10.6.0
|
| 43 |
+
mpld3==0.5.10
|
| 44 |
+
mpmath==1.3.0
|
| 45 |
+
networkx==3.4.2
|
| 46 |
+
numpy==1.26.4
|
| 47 |
openai==1.59.9
|
| 48 |
packaging==24.2
|
| 49 |
pdf2image==1.17.0
|
| 50 |
pillow==11.1.0
|
| 51 |
+
pptree==3.1
|
| 52 |
+
protobuf==5.29.3
|
| 53 |
+
psutil==7.0.0
|
| 54 |
pydantic==2.10.5
|
| 55 |
pydantic_core==2.27.2
|
| 56 |
pydash==8.0.5
|
| 57 |
+
pyparsing==3.2.1
|
| 58 |
+
PySocks==1.7.1
|
| 59 |
pytesseract==0.3.13
|
| 60 |
+
python-dateutil==2.9.0.post0
|
| 61 |
python-dotenv==1.0.1
|
| 62 |
python-multipart==0.0.20
|
| 63 |
+
pytorch_revgrad==0.2.0
|
| 64 |
PyYAML==6.0.2
|
| 65 |
+
regex==2024.11.6
|
| 66 |
+
requests==2.32.3
|
| 67 |
+
s3transfer==0.11.3
|
| 68 |
+
safetensors==0.5.3
|
| 69 |
+
scikit-learn==1.6.1
|
| 70 |
+
scipy==1.15.2
|
| 71 |
+
segtok==1.5.11
|
| 72 |
+
sentencepiece==0.2.0
|
| 73 |
+
setuptools==75.8.2
|
| 74 |
+
six==1.17.0
|
| 75 |
sniffio==1.3.1
|
| 76 |
+
sortedcontainers==2.4.0
|
| 77 |
+
soupsieve==2.6
|
| 78 |
+
sqlitedict==2.1.0
|
| 79 |
starlette==0.41.3
|
| 80 |
+
sympy==1.13.1
|
| 81 |
+
tabulate==0.9.0
|
| 82 |
+
threadpoolctl==3.5.0
|
| 83 |
+
tokenizers==0.21.0
|
| 84 |
+
torch==2.6.0
|
| 85 |
tqdm==4.67.1
|
| 86 |
+
transformer-smaller-training-vocab==0.4.0
|
| 87 |
+
transformers==4.49.0
|
| 88 |
typing_extensions==4.12.2
|
| 89 |
+
urllib3==2.3.0
|
| 90 |
uvicorn==0.34.0
|
| 91 |
uvloop==0.21.0
|
| 92 |
watchfiles==1.0.4
|
| 93 |
+
wcwidth==0.2.13
|
| 94 |
websockets==14.2
|
| 95 |
+
Wikipedia-API==0.8.1
|
| 96 |
+
wrapt==1.17.2
|