Fix: Embedding error when docx contains unsupported images (#1720)
Browse files
### What problem does this PR solve?
Fix the failure to embed a docx document when it contains
unsupported images.
### Type of change
- [x] Bug Fix (non-breaking change which fixes an issue)
---------
Co-authored-by: Kevin Hu <[email protected]>
- rag/app/naive.py +11 -4
rag/app/naive.py
CHANGED
|
@@ -23,7 +23,7 @@ from rag.utils import num_tokens_from_string
|
|
| 23 |
from PIL import Image
|
| 24 |
from functools import reduce
|
| 25 |
from markdown import markdown
|
| 26 |
-
|
| 27 |
|
| 28 |
class Docx(DocxParser):
|
| 29 |
def __init__(self):
|
|
@@ -36,9 +36,16 @@ class Docx(DocxParser):
|
|
| 36 |
img = img[0]
|
| 37 |
embed = img.xpath('.//a:blip/@r:embed')[0]
|
| 38 |
related_part = document.part.related_parts[embed]
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
|
| 43 |
def __clean(self, line):
|
| 44 |
line = re.sub(r"\u3000", " ", line).strip()
|
|
|
|
| 23 |
from PIL import Image
|
| 24 |
from functools import reduce
|
| 25 |
from markdown import markdown
|
| 26 |
+
from docx.image.exceptions import UnrecognizedImageError
|
| 27 |
|
| 28 |
class Docx(DocxParser):
|
| 29 |
def __init__(self):
|
|
|
|
| 36 |
img = img[0]
|
| 37 |
embed = img.xpath('.//a:blip/@r:embed')[0]
|
| 38 |
related_part = document.part.related_parts[embed]
|
| 39 |
+
try:
|
| 40 |
+
image_blob = related_part.image.blob
|
| 41 |
+
except UnrecognizedImageError:
|
| 42 |
+
print("Unrecognized image format. Skipping image.")
|
| 43 |
+
return None
|
| 44 |
+
try:
|
| 45 |
+
image = Image.open(BytesIO(image_blob)).convert('RGB')
|
| 46 |
+
return image
|
| 47 |
+
except Exception as e:
|
| 48 |
+
return None
|
| 49 |
|
| 50 |
def __clean(self, line):
|
| 51 |
line = re.sub(r"\u3000", " ", line).strip()
|