KevinHuSh
committed on
Commit
·
5db8a67
1
Parent(s):
be5bfb7
remove PyMuPDF (#618)
Browse files
### What problem does this PR solve?
#613
### Type of change
- [x] Other (please describe):
- api/utils/file_utils.py +3 -5
- deepdoc/parser/pdf_parser.py +2 -21
- deepdoc/vision/__init__.py +7 -8
- rag/utils/minio_conn.py +1 -1
- requirements.txt +0 -2
api/utils/file_utils.py
CHANGED
|
@@ -19,7 +19,7 @@ import os
|
|
| 19 |
import re
|
| 20 |
from io import BytesIO
|
| 21 |
|
| 22 |
-
import fitz
|
| 23 |
from PIL import Image
|
| 24 |
from cachetools import LRUCache, cached
|
| 25 |
from ruamel.yaml import YAML
|
|
@@ -172,11 +172,9 @@ def filename_type(filename):
|
|
| 172 |
def thumbnail(filename, blob):
|
| 173 |
filename = filename.lower()
|
| 174 |
if re.match(r".*\.pdf$", filename):
|
| 175 |
-
pdf =
|
| 176 |
-
pix = pdf[0].get_pixmap(matrix=fitz.Matrix(0.03, 0.03))
|
| 177 |
buffered = BytesIO()
|
| 178 |
-
|
| 179 |
-
pix.samples).save(buffered, format="png")
|
| 180 |
return "data:image/png;base64," + \
|
| 181 |
base64.b64encode(buffered.getvalue()).decode("utf-8")
|
| 182 |
|
|
|
|
| 19 |
import re
|
| 20 |
from io import BytesIO
|
| 21 |
|
| 22 |
+
import pdfplumber
|
| 23 |
from PIL import Image
|
| 24 |
from cachetools import LRUCache, cached
|
| 25 |
from ruamel.yaml import YAML
|
|
|
|
| 172 |
def thumbnail(filename, blob):
|
| 173 |
filename = filename.lower()
|
| 174 |
if re.match(r".*\.pdf$", filename):
|
| 175 |
+
pdf = pdfplumber.open(BytesIO(blob))
|
|
|
|
| 176 |
buffered = BytesIO()
|
| 177 |
+
pdf.pages[0].to_image().annotated.save(buffered, format="png")
|
|
|
|
| 178 |
return "data:image/png;base64," + \
|
| 179 |
base64.b64encode(buffered.getvalue()).decode("utf-8")
|
| 180 |
|
deepdoc/parser/pdf_parser.py
CHANGED
|
@@ -2,7 +2,6 @@
|
|
| 2 |
import os
|
| 3 |
import random
|
| 4 |
|
| 5 |
-
import fitz
|
| 6 |
import xgboost as xgb
|
| 7 |
from io import BytesIO
|
| 8 |
import torch
|
|
@@ -922,9 +921,7 @@ class RAGFlowPdfParser:
|
|
| 922 |
fnm) if not binary else pdfplumber.open(BytesIO(binary))
|
| 923 |
return len(pdf.pages)
|
| 924 |
except Exception as e:
|
| 925 |
-
|
| 926 |
-
stream=fnm, filetype="pdf")
|
| 927 |
-
return len(pdf)
|
| 928 |
|
| 929 |
def __images__(self, fnm, zoomin=3, page_from=0,
|
| 930 |
page_to=299, callback=None):
|
|
@@ -946,23 +943,7 @@ class RAGFlowPdfParser:
|
|
| 946 |
self.pdf.pages[page_from:page_to]]
|
| 947 |
self.total_page = len(self.pdf.pages)
|
| 948 |
except Exception as e:
|
| 949 |
-
|
| 950 |
-
fnm, str) else fitz.open(
|
| 951 |
-
stream=fnm, filetype="pdf")
|
| 952 |
-
self.page_images = []
|
| 953 |
-
self.page_chars = []
|
| 954 |
-
mat = fitz.Matrix(zoomin, zoomin)
|
| 955 |
-
self.total_page = len(self.pdf)
|
| 956 |
-
for i, page in enumerate(self.pdf):
|
| 957 |
-
if i < page_from:
|
| 958 |
-
continue
|
| 959 |
-
if i >= page_to:
|
| 960 |
-
break
|
| 961 |
-
pix = page.get_pixmap(matrix=mat)
|
| 962 |
-
img = Image.frombytes("RGB", [pix.width, pix.height],
|
| 963 |
-
pix.samples)
|
| 964 |
-
self.page_images.append(img)
|
| 965 |
-
self.page_chars.append([])
|
| 966 |
|
| 967 |
self.outlines = []
|
| 968 |
try:
|
|
|
|
| 2 |
import os
|
| 3 |
import random
|
| 4 |
|
|
|
|
| 5 |
import xgboost as xgb
|
| 6 |
from io import BytesIO
|
| 7 |
import torch
|
|
|
|
| 921 |
fnm) if not binary else pdfplumber.open(BytesIO(binary))
|
| 922 |
return len(pdf.pages)
|
| 923 |
except Exception as e:
|
| 924 |
+
logging.error(str(e))
|
|
|
|
|
|
|
| 925 |
|
| 926 |
def __images__(self, fnm, zoomin=3, page_from=0,
|
| 927 |
page_to=299, callback=None):
|
|
|
|
| 943 |
self.pdf.pages[page_from:page_to]]
|
| 944 |
self.total_page = len(self.pdf.pages)
|
| 945 |
except Exception as e:
|
| 946 |
+
logging.error(str(e))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 947 |
|
| 948 |
self.outlines = []
|
| 949 |
try:
|
deepdoc/vision/__init__.py
CHANGED
|
@@ -1,12 +1,13 @@
|
|
|
|
|
| 1 |
|
| 2 |
from .ocr import OCR
|
| 3 |
from .recognizer import Recognizer
|
| 4 |
from .layout_recognizer import LayoutRecognizer
|
| 5 |
from .table_structure_recognizer import TableStructureRecognizer
|
| 6 |
|
|
|
|
| 7 |
def init_in_out(args):
|
| 8 |
from PIL import Image
|
| 9 |
-
import fitz
|
| 10 |
import os
|
| 11 |
import traceback
|
| 12 |
from api.utils.file_utils import traversal_files
|
|
@@ -18,13 +19,11 @@ def init_in_out(args):
|
|
| 18 |
|
| 19 |
def pdf_pages(fnm, zoomin=3):
|
| 20 |
nonlocal outputs, images
|
| 21 |
-
pdf =
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
pix.samples)
|
| 27 |
-
images.append(img)
|
| 28 |
outputs.append(os.path.split(fnm)[-1] + f"_{i}.jpg")
|
| 29 |
|
| 30 |
def images_and_outputs(fnm):
|
|
|
|
| 1 |
+
import pdfplumber
|
| 2 |
|
| 3 |
from .ocr import OCR
|
| 4 |
from .recognizer import Recognizer
|
| 5 |
from .layout_recognizer import LayoutRecognizer
|
| 6 |
from .table_structure_recognizer import TableStructureRecognizer
|
| 7 |
|
| 8 |
+
|
| 9 |
def init_in_out(args):
|
| 10 |
from PIL import Image
|
|
|
|
| 11 |
import os
|
| 12 |
import traceback
|
| 13 |
from api.utils.file_utils import traversal_files
|
|
|
|
| 19 |
|
| 20 |
def pdf_pages(fnm, zoomin=3):
|
| 21 |
nonlocal outputs, images
|
| 22 |
+
pdf = pdfplumber.open(fnm)
|
| 23 |
+
images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
|
| 24 |
+
enumerate(pdf.pages)]
|
| 25 |
+
|
| 26 |
+
for i, page in enumerate(images):
|
|
|
|
|
|
|
| 27 |
outputs.append(os.path.split(fnm)[-1] + f"_{i}.jpg")
|
| 28 |
|
| 29 |
def images_and_outputs(fnm):
|
rag/utils/minio_conn.py
CHANGED
|
@@ -35,7 +35,7 @@ class RAGFlowMinio(object):
|
|
| 35 |
self.conn = None
|
| 36 |
|
| 37 |
def put(self, bucket, fnm, binary):
|
| 38 |
-
for _ in range(
|
| 39 |
try:
|
| 40 |
if not self.conn.bucket_exists(bucket):
|
| 41 |
self.conn.make_bucket(bucket)
|
|
|
|
| 35 |
self.conn = None
|
| 36 |
|
| 37 |
def put(self, bucket, fnm, binary):
|
| 38 |
+
for _ in range(3):
|
| 39 |
try:
|
| 40 |
if not self.conn.bucket_exists(bucket):
|
| 41 |
self.conn.make_bucket(bucket)
|
requirements.txt
CHANGED
|
@@ -91,8 +91,6 @@ pycryptodomex==3.20.0
|
|
| 91 |
pydantic==2.6.2
|
| 92 |
pydantic_core==2.16.3
|
| 93 |
PyJWT==2.8.0
|
| 94 |
-
PyMuPDF==1.23.25
|
| 95 |
-
PyMuPDFb==1.23.22
|
| 96 |
PyMySQL==1.1.0
|
| 97 |
PyPDF2==3.0.1
|
| 98 |
pypdfium2==4.27.0
|
|
|
|
| 91 |
pydantic==2.6.2
|
| 92 |
pydantic_core==2.16.3
|
| 93 |
PyJWT==2.8.0
|
|
|
|
|
|
|
| 94 |
PyMySQL==1.1.0
|
| 95 |
PyPDF2==3.0.1
|
| 96 |
pypdfium2==4.27.0
|