aopstudio committed on
Commit
867015d
·
1 Parent(s): ef53081

Add docx support for QA parser (#1213)

Browse files

### What problem does this PR solve?

Add docx support for QA parser

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

Files changed (1) hide show
  1. rag/app/qa.py +120 -3
rag/app/qa.py CHANGED
@@ -19,7 +19,9 @@ from openpyxl import load_workbook
19
  from rag.nlp import is_english, random_choices, find_codec, qbullets_category, add_positions, has_qbullet
20
  from rag.nlp import rag_tokenizer, tokenize_table
21
  from rag.settings import cron_logger
22
- from deepdoc.parser import PdfParser, ExcelParser
 
 
23
  class Excel(ExcelParser):
24
  def __call__(self, fnm, binary=None, callback=None):
25
  if not binary:
@@ -119,7 +121,99 @@ class Pdf(PdfParser):
119
  if last_q:
120
  qai_list.append((last_q, last_a, *self.crop(last_tag, need_position=True)))
121
  return qai_list, tbls
122
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
  def rmPrefix(txt):
124
  return re.sub(
125
  r"^(问题|答案|回答|user|assistant|Q|A|Question|Answer|问|答)[\t:: ]+", "", txt.strip(), flags=re.IGNORECASE)
@@ -136,6 +230,16 @@ def beAdocPdf(d, q, a, eng, image, poss):
136
  add_positions(d, poss)
137
  return d
138
 
 
 
 
 
 
 
 
 
 
 
139
  def beAdoc(d, q, a, eng):
140
  qprefix = "Question: " if eng else "问题:"
141
  aprefix = "Answer: " if eng else "回答:"
@@ -150,6 +254,11 @@ def mdQuestionLevel(s):
150
  match = re.match(r'#*', s)
151
  return (len(match.group(0)), s.lstrip('#').lstrip()) if match else (0, s)
152
 
 
 
 
 
 
153
 
154
  def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
155
  """
@@ -278,9 +387,17 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
278
  if sum_question:
279
  res.append(beAdoc(deepcopy(doc), sum_question, last_answer, eng))
280
  return res
 
 
 
 
 
 
 
 
281
 
282
  raise NotImplementedError(
283
- "Excel, csv(txt), pdf and markdown format files are supported.")
284
 
285
 
286
  if __name__ == "__main__":
 
19
  from rag.nlp import is_english, random_choices, find_codec, qbullets_category, add_positions, has_qbullet
20
  from rag.nlp import rag_tokenizer, tokenize_table
21
  from rag.settings import cron_logger
22
+ from deepdoc.parser import PdfParser, ExcelParser, DocxParser
23
+ from docx import Document
24
+ from PIL import Image
25
  class Excel(ExcelParser):
26
  def __call__(self, fnm, binary=None, callback=None):
27
  if not binary:
 
121
  if last_q:
122
  qai_list.append((last_q, last_a, *self.crop(last_tag, need_position=True)))
123
  return qai_list, tbls
124
class Docx(DocxParser):
    """Extract question/answer pairs and tables from a .docx document.

    Paragraphs whose heading level (as reported by ``docxQuestionLevel``) is
    between 1 and 6 are treated as questions. Nested headings are kept on a
    stack and joined with newlines into one composite question. Every other
    paragraph is appended to the current answer, together with the first
    inline picture it contains.
    """

    def __init__(self):
        # Nothing to set up; the base-class initializer is not invoked here.
        pass

    def get_picture(self, document, paragraph):
        """Return the first inline picture of ``paragraph`` as a PIL image.

        Args:
            document: The opened docx document that owns ``paragraph``.
            paragraph: The paragraph to inspect.

        Returns:
            A PIL image, or None when the paragraph holds no picture or the
            picture cannot be resolved or decoded.
        """
        pics = paragraph._element.xpath('.//pic:pic')
        if not pics:
            return None
        embeds = pics[0].xpath('.//a:blip/@r:embed')
        if not embeds:
            # A picture without an r:embed relationship id has no embedded
            # payload to load.
            return None
        try:
            related_part = document.part.related_parts[embeds[0]]
            # Read the raw part bytes and let PIL identify the format.
            return Image.open(BytesIO(related_part.blob))
        except (KeyError, OSError):
            # Dangling relationship or an image PIL cannot decode: skip the
            # picture instead of aborting the whole document.
            return None

    def concat_img(self, img1, img2):
        """Stack ``img1`` above ``img2``; either argument may be None.

        Returns:
            The other image when one is missing, None when both are missing,
            otherwise a new RGB image as wide as the wider input and as tall
            as both inputs together.
        """
        if not img1:
            return img2 if img2 else None
        if not img2:
            return img1

        width1, height1 = img1.size
        width2, height2 = img2.size
        stacked = Image.new('RGB', (max(width1, width2), height1 + height2))
        stacked.paste(img1, (0, 0))
        stacked.paste(img2, (0, height1))
        return stacked

    @staticmethod
    def _table_to_html(tb):
        """Render one docx table as an HTML ``<table>`` string.

        Runs of *adjacent* cells with identical text are collapsed into a
        single ``<td colspan='N'>`` cell.
        """
        parts = ["<table>"]
        for r in tb.rows:
            cells = r.cells  # fetch once per row instead of on every access
            parts.append("<tr>")
            i = 0
            while i < len(cells):
                text = cells[i].text
                j = i + 1
                # Stop at the first differing cell so that only neighbours
                # are merged and no cell in between is dropped.
                while j < len(cells) and cells[j].text == text:
                    j += 1
                span = j - i
                parts.append(
                    f"<td>{text}</td>" if span == 1
                    else f"<td colspan='{span}'>{text}</td>")
                i = j
            parts.append("</tr>")
        parts.append("</table>")
        return "".join(parts)

    def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None):
        """Parse the document into question/answer/image triples and tables.

        Args:
            filename: Path of the .docx file; used when ``binary`` is empty.
            binary: Raw file content; takes precedence over ``filename``.
            from_page: First page (0-based) whose text is classified.
            to_page: Page at which text classification stops (exclusive).
            callback: Accepted for interface compatibility; not used here.

        Returns:
            ``(qai_list, tbls)`` where ``qai_list`` holds
            ``(question, answer, image)`` tuples and ``tbls`` holds
            ``((None, html), "")`` tuples, one per table.
        """
        self.doc = Document(BytesIO(binary)) if binary else Document(filename)
        pn = 0
        last_answer, last_image = "", None
        question_stack, level_stack = [], []
        qai_list = []
        for p in self.doc.paragraphs:
            if pn > to_page:
                break
            question_level, p_text = 0, ''
            if from_page <= pn < to_page and p.text.strip():
                question_level, p_text = docxQuestionLevel(p)
            if not question_level or question_level > 6:  # not a question
                last_answer = f'{last_answer}\n{p_text}'
                current_image = self.get_picture(self.doc, p)
                last_image = self.concat_img(last_image, current_image)
            else:  # is a question
                if last_answer or last_image:
                    # Close the pending answer under the question path that
                    # was active while it was collected.
                    sum_question = '\n'.join(question_stack)
                    if sum_question:
                        qai_list.append((sum_question, last_answer, last_image))
                    last_answer, last_image = '', None

                # Drop headings at the same or a deeper level before pushing
                # the new one, so the stack always describes one path.
                while question_stack and question_level <= level_stack[-1]:
                    question_stack.pop()
                    level_stack.pop()
                question_stack.append(p_text)
                level_stack.append(question_level)
            for run in p.runs:
                xml = run._element.xml  # serialise once per run
                # Advance the page counter on rendered or explicit page-break
                # markers found in the run XML.
                if 'lastRenderedPageBreak' in xml:
                    pn += 1
                    continue
                if 'w:br' in xml and 'type="page"' in xml:
                    pn += 1
        # Flush the trailing answer with the same condition used in the loop.
        if last_answer or last_image:
            sum_question = '\n'.join(question_stack)
            if sum_question:
                qai_list.append((sum_question, last_answer, last_image))

        tbls = [((None, self._table_to_html(tb)), "") for tb in self.doc.tables]
        return qai_list, tbls
216
+
217
  def rmPrefix(txt):
218
  return re.sub(
219
  r"^(问题|答案|回答|user|assistant|Q|A|Question|Answer|问|答)[\t:: ]+", "", txt.strip(), flags=re.IGNORECASE)
 
230
  add_positions(d, poss)
231
  return d
232
 
233
def beAdocDocx(d, q, a, eng, image):
    """Fill chunk dict ``d`` with a docx question/answer pair and its image.

    Writes the prefixed, tab-joined Q/A text to ``content_with_weight``, the
    tokenized question to ``content_ltks`` and ``content_sm_ltks``, and the
    picture to ``image``. Returns the same ``d``.
    """
    if eng:
        qprefix, aprefix = "Question: ", "Answer: "
    else:
        qprefix, aprefix = "问题:", "回答:"

    question_text = qprefix + rmPrefix(q)
    answer_text = aprefix + rmPrefix(a)
    d["content_with_weight"] = "\t".join((question_text, answer_text))

    # Only the question is tokenized for retrieval.
    d["content_ltks"] = rag_tokenizer.tokenize(q)
    d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
    d["image"] = image
    return d
242
+
243
  def beAdoc(d, q, a, eng):
244
  qprefix = "Question: " if eng else "问题:"
245
  aprefix = "Answer: " if eng else "回答:"
 
254
  match = re.match(r'#*', s)
255
  return (len(match.group(0)), s.lstrip('#').lstrip()) if match else (0, s)
256
 
257
def docxQuestionLevel(p):
    """Return ``(heading_level, cleaned_text)`` for a docx paragraph.

    The level is the trailing integer of a style name that starts with
    ``Heading`` (e.g. ``"Heading 2"`` -> 2). Any other paragraph, including a
    ``Heading...`` style without a numeric suffix or a paragraph without a
    named style, gets level 0. Ideographic spaces (U+3000) in the text are
    replaced by regular spaces before stripping.
    """
    text = re.sub(r"\u3000", " ", p.text).strip()
    style = getattr(p, "style", None)
    style_name = getattr(style, "name", None) or ""
    if not style_name.startswith('Heading'):
        return 0, text
    try:
        return int(style_name.split(' ')[-1]), text
    except ValueError:
        # Names such as "Heading" or "Heading 1 Char" carry no usable level;
        # treat them as body text instead of aborting the parse.
        return 0, text
262
 
263
  def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
264
  """
 
387
  if sum_question:
388
  res.append(beAdoc(deepcopy(doc), sum_question, last_answer, eng))
389
  return res
390
+ elif re.search(r"\.docx$", filename, re.IGNORECASE):
391
+ docx_parser = Docx()
392
+ qai_list, tbls = docx_parser(filename, binary,
393
+ from_page=0, to_page=10000, callback=callback)
394
+ res = tokenize_table(tbls, doc, eng)
395
+ for q, a, image in qai_list:
396
+ res.append(beAdocDocx(deepcopy(doc), q, a, eng, image))
397
+ return res
398
 
399
  raise NotImplementedError(
400
+ "Excel, csv(txt), pdf, markdown and docx format files are supported.")
401
 
402
 
403
  if __name__ == "__main__":