aopstudio committed on
Commit
4adcb3c
·
1 Parent(s): 4417740

Support displaying images in the chunks of docx files when using general parser (#1253)

Browse files

### What problem does this PR solve?

Support displaying images in chunks of docx files when using general
parser

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

Files changed (3) hide show
  1. rag/app/naive.py +53 -6
  2. rag/app/qa.py +3 -21
  3. rag/nlp/__init__.py +65 -1
rag/app/naive.py CHANGED
@@ -16,16 +16,28 @@ from docx import Document
16
  from timeit import default_timer as timer
17
  import re
18
  from deepdoc.parser.pdf_parser import PlainParser
19
- from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec
20
  from deepdoc.parser import PdfParser, ExcelParser, DocxParser, HtmlParser, JsonParser
21
  from rag.settings import cron_logger
22
  from rag.utils import num_tokens_from_string
23
-
 
24
 
25
  class Docx(DocxParser):
26
  def __init__(self):
27
  pass
28
 
 
 
 
 
 
 
 
 
 
 
 
29
  def __clean(self, line):
30
  line = re.sub(r"\u3000", " ", line).strip()
31
  return line
@@ -35,17 +47,41 @@ class Docx(DocxParser):
35
  filename) if not binary else Document(BytesIO(binary))
36
  pn = 0
37
  lines = []
 
38
  for p in self.doc.paragraphs:
39
  if pn > to_page:
40
  break
41
- if from_page <= pn < to_page and p.text.strip():
42
- lines.append(self.__clean(p.text))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  for run in p.runs:
44
  if 'lastRenderedPageBreak' in run._element.xml:
45
  pn += 1
46
  continue
47
  if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
48
  pn += 1
 
49
  tbls = []
50
  for tb in self.doc.tables:
51
  html= "<table>"
@@ -64,7 +100,7 @@ class Docx(DocxParser):
64
  html += "</tr>"
65
  html += "</table>"
66
  tbls.append(((None, html), ""))
67
- return [(l, "") for l in lines if l], tbls
68
 
69
 
70
  class Pdf(PdfParser):
@@ -123,8 +159,19 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
123
  if re.search(r"\.docx$", filename, re.IGNORECASE):
124
  callback(0.1, "Start to parse.")
125
  sections, tbls = Docx()(filename, binary)
126
- res = tokenize_table(tbls, doc, eng)
 
127
  callback(0.8, "Finish parsing.")
 
 
 
 
 
 
 
 
 
 
128
 
129
  elif re.search(r"\.pdf$", filename, re.IGNORECASE):
130
  pdf_parser = Pdf(
 
16
  from timeit import default_timer as timer
17
  import re
18
  from deepdoc.parser.pdf_parser import PlainParser
19
+ from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec, concat_img, naive_merge_docx, tokenize_chunks_docx
20
  from deepdoc.parser import PdfParser, ExcelParser, DocxParser, HtmlParser, JsonParser
21
  from rag.settings import cron_logger
22
  from rag.utils import num_tokens_from_string
23
+ from PIL import Image
24
+ from functools import reduce
25
 
26
  class Docx(DocxParser):
27
  def __init__(self):
28
  pass
29
 
30
def get_picture(self, document, paragraph):
    """Extract the first inline image embedded in *paragraph*.

    Args:
        document: the python-docx Document owning the paragraph; its part
            holds the relationship table mapping r:embed ids to image parts.
        paragraph: a python-docx Paragraph to scan for drawings.

    Returns:
        A PIL.Image converted to RGB mode, or None when the paragraph has
        no picture or the embedded reference cannot be resolved.
    """
    pics = paragraph._element.xpath('.//pic:pic')
    if not pics:
        return None
    # The blip element references the image part via its r:embed id.
    embeds = pics[0].xpath('.//a:blip/@r:embed')
    if not embeds:
        # Drawing without an embedded relationship (e.g. a linked image):
        # previously this raised IndexError; treat it as "no image".
        return None
    try:
        related_part = document.part.related_parts[embeds[0]]
    except KeyError:
        # Dangling relationship id — treat as "no image" rather than crash.
        return None
    # Normalize to RGB so later Image.new/paste composition in concat_img
    # never mixes modes (palette or RGBA blobs).
    return Image.open(BytesIO(related_part.image.blob)).convert('RGB')
40
+
41
  def __clean(self, line):
42
  line = re.sub(r"\u3000", " ", line).strip()
43
  return line
 
47
  filename) if not binary else Document(BytesIO(binary))
48
  pn = 0
49
  lines = []
50
+ last_image = None
51
  for p in self.doc.paragraphs:
52
  if pn > to_page:
53
  break
54
+ if from_page <= pn < to_page:
55
+ current_image = None
56
+ if p.text.strip():
57
+ if p.style.name == 'Caption':
58
+ former_image = None
59
+ if lines and lines[-1][1] and lines[-1][2] != 'Caption':
60
+ former_image = lines[-1][1].pop()
61
+ elif last_image:
62
+ former_image = last_image
63
+ last_image = None
64
+ lines.append((self.__clean(p.text), [former_image], p.style.name))
65
+ else:
66
+ current_image = self.get_picture(self.doc, p)
67
+ image_list = [current_image]
68
+ if last_image:
69
+ image_list.insert(0, last_image)
70
+ last_image = None
71
+ lines.append((self.__clean(p.text), image_list, p.style.name))
72
+ else:
73
+ if current_image := self.get_picture(self.doc, p):
74
+ if lines:
75
+ lines[-1][1].append(current_image)
76
+ else:
77
+ last_image = current_image
78
  for run in p.runs:
79
  if 'lastRenderedPageBreak' in run._element.xml:
80
  pn += 1
81
  continue
82
  if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
83
  pn += 1
84
+ new_line = [(line[0], reduce(concat_img, line[1])) for line in lines]
85
  tbls = []
86
  for tb in self.doc.tables:
87
  html= "<table>"
 
100
  html += "</tr>"
101
  html += "</table>"
102
  tbls.append(((None, html), ""))
103
+ return new_line, tbls
104
 
105
 
106
  class Pdf(PdfParser):
 
159
  if re.search(r"\.docx$", filename, re.IGNORECASE):
160
  callback(0.1, "Start to parse.")
161
  sections, tbls = Docx()(filename, binary)
162
+ res = tokenize_table(tbls, doc, eng) # just for table
163
+
164
  callback(0.8, "Finish parsing.")
165
+ st = timer()
166
+
167
+ chunks, images = naive_merge_docx(
168
+ sections, int(parser_config.get(
169
+ "chunk_token_num", 128)), parser_config.get(
170
+ "delimiter", "\n!?。;!?"))
171
+
172
+ res.extend(tokenize_chunks_docx(chunks, doc, eng, images))
173
+ cron_logger.info("naive_merge({}): {}".format(filename, timer() - st))
174
+ return res
175
 
176
  elif re.search(r"\.pdf$", filename, re.IGNORECASE):
177
  pdf_parser = Pdf(
rag/app/qa.py CHANGED
@@ -17,7 +17,7 @@ from timeit import default_timer as timer
17
  from nltk import word_tokenize
18
  from openpyxl import load_workbook
19
  from rag.nlp import is_english, random_choices, find_codec, qbullets_category, add_positions, has_qbullet, docx_question_level
20
- from rag.nlp import rag_tokenizer, tokenize_table
21
  from rag.settings import cron_logger
22
  from deepdoc.parser import PdfParser, ExcelParser, DocxParser
23
  from docx import Document
@@ -174,26 +174,8 @@ class Docx(DocxParser):
174
  embed = img.xpath('.//a:blip/@r:embed')[0]
175
  related_part = document.part.related_parts[embed]
176
  image = related_part.image
177
- image = Image.open(BytesIO(image.blob))
178
  return image
179
- def concat_img(self, img1, img2):
180
- if img1 and not img2:
181
- return img1
182
- if not img1 and img2:
183
- return img2
184
- if not img1 and not img2:
185
- return None
186
- width1, height1 = img1.size
187
- width2, height2 = img2.size
188
-
189
- new_width = max(width1, width2)
190
- new_height = height1 + height2
191
- new_image = Image.new('RGB', (new_width, new_height))
192
-
193
- new_image.paste(img1, (0, 0))
194
- new_image.paste(img2, (0, height1))
195
-
196
- return new_image
197
 
198
  def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None):
199
  self.doc = Document(
@@ -211,7 +193,7 @@ class Docx(DocxParser):
211
  if not question_level or question_level > 6: # not a question
212
  last_answer = f'{last_answer}\n{p_text}'
213
  current_image = self.get_picture(self.doc, p)
214
- last_image = self.concat_img(last_image, current_image)
215
  else: # is a question
216
  if last_answer or last_image:
217
  sum_question = '\n'.join(question_stack)
 
17
  from nltk import word_tokenize
18
  from openpyxl import load_workbook
19
  from rag.nlp import is_english, random_choices, find_codec, qbullets_category, add_positions, has_qbullet, docx_question_level
20
+ from rag.nlp import rag_tokenizer, tokenize_table, concat_img
21
  from rag.settings import cron_logger
22
  from deepdoc.parser import PdfParser, ExcelParser, DocxParser
23
  from docx import Document
 
174
  embed = img.xpath('.//a:blip/@r:embed')[0]
175
  related_part = document.part.related_parts[embed]
176
  image = related_part.image
177
+ image = Image.open(BytesIO(image.blob)).convert('RGB')
178
  return image
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
 
180
  def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None):
181
  self.doc = Document(
 
193
  if not question_level or question_level > 6: # not a question
194
  last_answer = f'{last_answer}\n{p_text}'
195
  current_image = self.get_picture(self.doc, p)
196
+ last_image = concat_img(last_image, current_image)
197
  else: # is a question
198
  if last_answer or last_image:
199
  sum_question = '\n'.join(question_stack)
rag/nlp/__init__.py CHANGED
@@ -24,6 +24,7 @@ import copy
24
  import roman_numbers as r
25
  from word2number import w2n
26
  from cn2an import cn2an
 
27
 
28
  all_codecs = [
29
  'utf-8', 'gb2312', 'gbk', 'utf_16', 'ascii', 'big5', 'big5hkscs',
@@ -246,6 +247,19 @@ def tokenize_chunks(chunks, doc, eng, pdf_parser):
246
  return res
247
 
248
 
 
 
 
 
 
 
 
 
 
 
 
 
 
249
  def tokenize_table(tbls, doc, eng, batch_size=10):
250
  res = []
251
  # add tables
@@ -504,4 +518,54 @@ def docx_question_level(p):
504
  if p.style.name.startswith('Heading'):
505
  return int(p.style.name.split(' ')[-1]), re.sub(r"\u3000", " ", p.text).strip()
506
  else:
507
- return 0, re.sub(r"\u3000", " ", p.text).strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  import roman_numbers as r
25
  from word2number import w2n
26
  from cn2an import cn2an
27
+ from PIL import Image
28
 
29
  all_codecs = [
30
  'utf-8', 'gb2312', 'gbk', 'utf_16', 'ascii', 'big5', 'big5hkscs',
 
247
  return res
248
 
249
 
250
def tokenize_chunks_docx(chunks, doc, eng, images):
    """Build indexable documents from docx chunks paired with their images.

    Args:
        chunks: list of chunk texts produced by naive_merge_docx.
        doc: template dict deep-copied into every produced document.
        eng: True when the text is English (forwarded to tokenize).
        images: per-chunk images aligned with *chunks* (entries may be None).

    Returns:
        A list of dicts ready for indexing, each carrying its chunk's image.
    """
    res = []
    # wrap up as es documents
    for ck, image in zip(chunks, images):
        if not ck.strip():
            continue  # skip empty/whitespace-only chunks
        # NOTE: removed leftover debug print of the chunk text.
        d = copy.deepcopy(doc)
        d["image"] = image
        tokenize(d, ck, eng)
        res.append(d)
    return res
261
+
262
+
263
  def tokenize_table(tbls, doc, eng, batch_size=10):
264
  res = []
265
  # add tables
 
518
  if p.style.name.startswith('Heading'):
519
  return int(p.style.name.split(' ')[-1]), re.sub(r"\u3000", " ", p.text).strip()
520
  else:
521
+ return 0, re.sub(r"\u3000", " ", p.text).strip()
522
+
523
def concat_img(img1, img2):
    """Stack two PIL images vertically into one RGB image.

    When either argument is missing the other is returned unchanged;
    returns None when both are missing.
    """
    # Guard clauses for the degenerate cases.
    if not img1 and not img2:
        return None
    if not img1:
        return img2
    if not img2:
        return img1

    w1, h1 = img1.size
    w2, h2 = img2.size

    # Canvas wide enough for the wider image, tall enough for both.
    canvas = Image.new('RGB', (max(w1, w2), h1 + h2))
    canvas.paste(img1, (0, 0))
    canvas.paste(img2, (0, h1))
    return canvas
541
+
542
def naive_merge_docx(sections, chunk_token_num=128, delimiter="\n。;!?"):
    """Merge (text, image) sections into chunks of roughly chunk_token_num tokens.

    Args:
        sections: iterable of (text, image) pairs from the docx parser;
            image entries may be None.
        chunk_token_num: soft token budget; once the current chunk exceeds
            it, the next section starts a fresh chunk.
        delimiter: kept for interface parity with naive_merge (unused here).

    Returns:
        (chunks, images): parallel lists; images[i] is the vertical
        concatenation (via concat_img) of all images merged into
        chunks[i], or None if that chunk had no images.
    """
    if not sections:
        # Bug fix: callers unpack two values
        # ("chunks, images = naive_merge_docx(...)"), so the empty case
        # must return a pair, not a single empty list.
        return [], []

    cks = [""]
    images = [None]
    tk_nums = [0]

    def add_chunk(t, image, pos=""):
        # delimiter removed from nonlocal: it is never assigned here.
        nonlocal cks, tk_nums
        tnum = num_tokens_from_string(t)
        if tnum < 8:
            pos = ""  # too short to be worth tagging with a position marker
        if tk_nums[-1] > chunk_token_num:
            # Current chunk is over budget: start a new one.
            if t.find(pos) < 0:
                t += pos
            cks.append(t)
            images.append(image)
            tk_nums.append(tnum)
        else:
            # Still under budget: append text to the current chunk and fold
            # the section's image into the chunk's composite image.
            if cks[-1].find(pos) < 0:
                t += pos
            cks[-1] += t
            images[-1] = concat_img(images[-1], image)
            tk_nums[-1] += tnum

    for sec, image in sections:
        add_chunk(sec, image, '')

    return cks, images