cyhasuka Kevin Hu committed on
Commit
b37fedc
·
1 Parent(s): eb9eea9

Fix: Embedding err when docx contains unsupported images (#1720)

Browse files

### What problem does this PR solve?

Fix a failure to generate embeddings when a docx document
contains images in an unsupported format.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

---------

Co-authored-by: Kevin Hu <[email protected]>

Files changed (1) hide show
  1. rag/app/naive.py +11 -4
rag/app/naive.py CHANGED
@@ -23,7 +23,7 @@ from rag.utils import num_tokens_from_string
23
  from PIL import Image
24
  from functools import reduce
25
  from markdown import markdown
26
-
27
 
28
  class Docx(DocxParser):
29
  def __init__(self):
@@ -36,9 +36,16 @@ class Docx(DocxParser):
36
  img = img[0]
37
  embed = img.xpath('.//a:blip/@r:embed')[0]
38
  related_part = document.part.related_parts[embed]
39
- image = related_part.image
40
- image = Image.open(BytesIO(image.blob)).convert('RGB')
41
- return image
 
 
 
 
 
 
 
42
 
43
  def __clean(self, line):
44
  line = re.sub(r"\u3000", " ", line).strip()
 
23
  from PIL import Image
24
  from functools import reduce
25
  from markdown import markdown
26
+ from docx.image.exceptions import UnrecognizedImageError
27
 
28
  class Docx(DocxParser):
29
  def __init__(self):
 
36
  img = img[0]
37
  embed = img.xpath('.//a:blip/@r:embed')[0]
38
  related_part = document.part.related_parts[embed]
39
+ try:
40
+ image_blob = related_part.image.blob
41
+ except UnrecognizedImageError:
42
+ print("Unrecognized image format. Skipping image.")
43
+ return None
44
+ try:
45
+ image = Image.open(BytesIO(image_blob)).convert('RGB')
46
+ return image
47
+ except Exception as e:
48
+ return None
49
 
50
  def __clean(self, line):
51
  line = re.sub(r"\u3000", " ", line).strip()